diff --git a/.github/workflows/archiver.yml b/.github/workflows/archiver.yml index 3aaf9b35ed..5ac17d45a2 100644 --- a/.github/workflows/archiver.yml +++ b/.github/workflows/archiver.yml @@ -1,7 +1,7 @@ # Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2024) for the MG5aMC CUDACPP plugin. -# Further modified by: D. Massaro, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: D. Massaro, A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. #---------------------------------------------------------------------------------------------------------------------------------- diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 7dd6a2f963..72ffe64b17 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -1,3 +1,8 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, S. Roiser, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. + name: C/C++ CI on: diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... 
) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T
 //--------------------------------------------------------------------------
 #elif defined __HIPCC__
+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
 #define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
 #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice hipSetDevice
 #define gpuDeviceSynchronize hipDeviceSynchronize
 #define gpuDeviceReset hipDeviceReset
 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif
 //--------------------------------------------------------------------------
+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
 #endif
 #endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
index 6a4b946e74..086aa6a616 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
 // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort =
 //--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); }
+inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true )
+{
+  if ( code != GPUBLAS_STATUS_SUCCESS )
+  {
+    printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 703ea3781c..5ede45b123 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. // Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. @@ -166,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -193,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -208,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -220,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -314,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -341,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -363,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. 
Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -385,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? 
&m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -403,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 8da04d7945..16f8874888 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -1,4 +1,4 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. // Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
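For readers not familiar with the runtime switch used above: the MatrixElementKernelDevice constructor enables the BLAS color sum only when the CUDACPP_RUNTIME_BLASCOLORSUM environment variable is set and non-empty, and only in builds where MGONGPU_HAS_NO_BLAS is not defined. The standalone sketch below only mirrors that decision logic for illustration; the helper name is hypothetical and not part of the plugin.

```cpp
// Minimal sketch (not plugin code) of the runtime BLAS-color-sum switch described above.
#include <cstdlib>
#include <stdexcept>
#include <string>

inline bool useBlasColorSum() // hypothetical helper, mirroring the MatrixElementKernelDevice ctor logic
{
  const char* blasEnv = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  const bool requested = ( blasEnv && std::string( blasEnv ) != "" );
#ifdef MGONGPU_HAS_NO_BLAS
  // hasNoBlas build: requesting BLAS at runtime is an error (the plugin throws in this case)
  if( requested ) throw std::runtime_error( "CUDACPP_RUNTIME_BLASCOLORSUM is set, but BLAS was disabled at build time" );
  return false;
#else
  // hasBlas build: BLAS color sums are off by default and only enabled on request
  return requested;
#endif
}
```

When BLAS is disabled (at build time or at runtime), computeMatrixElements passes null ghelAllBlasTmp and pBlasHandle pointers to sigmaKin, and the per-helicity color_sum_kernel path is used instead.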
@@ -8,9 +8,12 @@ #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index deddc425f5..936ef7a7ff 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_%(model_name)s_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << 
std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) 
per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc new file mode 100644 index 0000000000..d2b24bba27 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc @@ -0,0 +1,418 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** +%(color_matrix_lines)s + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 
value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for 
one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
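A note on the data layout before the header itself: DeviceAccessJamp (below) and the cuBLAS/hipBLAS calls in color_sum_blas (above) share the same "ievt last" striding for the jamp super-buffer, [2][ncolor][nGoodHel][nevt]. The small standalone helper below only illustrates that indexing convention; the function name and the numbers in the example are not taken from the plugin.

```cpp
// Illustrative index helper (not plugin code): flat offset into the jamp super-buffer
// laid out as [ix2][icol][ihel][ievt] with ix2 = 0 (real) or 1 (imaginary), "ievt last".
#include <cstddef>

constexpr std::size_t jampIndex( std::size_t ix2, std::size_t icol, std::size_t ihel, std::size_t ievt,
                                 std::size_t ncolor, std::size_t nhel, std::size_t nevt )
{
  // Equivalent to ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt
  return ( ( ix2 * ncolor + icol ) * nhel + ihel ) * nevt + ievt;
}

// Example with (hypothetical) ncolor=2, nhel=3, nevt=8: the imaginary part of color 1,
// helicity 2, event 5 sits at offset ((1*2+1)*3+2)*8+5 = 93.
static_assert( jampIndex( 1, 1, 2, 5, 2, 3, 8 ) == 93, "unexpected jamp offset" );
```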
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef 
MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 32d12a5bba..22acd3abe9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -479,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -599,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -782,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -801,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -834,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
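For illustration, a minimal standalone sketch (not part of the patch; the printed messages are placeholders for the real BLAS and kernel code paths) of how the -DMGONGPU_HAS_NO_BLAS flag added above via BLASCXXFLAGS is consumed at compile time.

#include <cstdio>

int main()
{
#ifndef MGONGPU_HAS_NO_BLAS
  // HASBLAS=hasBlas builds: the color sum may be offloaded to cuBLAS/hipBLAS
  std::printf( "BLAS-enabled build\n" );
#else
  // HASBLAS=hasNoBlas builds: only the plain CUDA/HIP kernel path is compiled
  std::printf( "noBLAS build\n" );
#endif
  return 0;
}

Compiling with and without -DMGONGPU_HAS_NO_BLAS (as cudacpp.mk does for HASBLAS=hasNoBlas) selects the corresponding branch.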
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -878,6 +931,7 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 @@ -979,6 +1033,7 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk index b4df265133..48b2037dc2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
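The cuBLAS/hipBLAS libraries linked in above via $(BLASLIBFLAGS) are used for the color sum, which per event and per helicity is the real quadratic form Re(J)^T M Re(J) + Im(J)^T M Im(J) over the ncolor jamps (M being the real, symmetric color matrix, see #475). Below is a standalone scalar sketch of that sum with made-up numbers, mirroring the scalar code removed from process_matrix.inc further down in this patch; the 2x2 color matrix and jamp values are illustrative only.

#include <cstdio>

int main()
{
  const int ncolor = 2;
  // Made-up example color matrix cf[i][j] and denominators denom[i] (process-dependent in reality)
  const double cf[2][2] = { { 16., -2. }, { -2., 16. } };
  const double denom[2] = { 6., 6. };
  // Made-up example jamps (QCD partial amplitudes) for one event and one helicity
  const double jampR[2] = { 0.3, -0.1 };
  const double jampI[2] = { 0.2, 0.4 };
  double me2 = 0.; // |M|^2 contribution of this helicity
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0., ztempI = 0.;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jampR[jcol]; // (M * Re(J))[icol]
      ztempI += cf[icol][jcol] * jampI[jcol]; // (M * Im(J))[icol]
    }
    me2 += ( ztempR * jampR[icol] + ztempI * jampI[icol] ) / denom[icol];
  }
  std::printf( "|M|^2 contribution for this helicity = %f\n", me2 );
  return 0;
}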
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 68bbf1b934..c32d0a2740 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing the color sum of the QCD partial amplitudes (jamps) +// For both CUDA and HIP, by default, use cuBLAS/hipBLAS if available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 444c848e10..4c35c3eec6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -14,6 +14,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" %(hel_amps_h)s #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -23,6 +24,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc index 4e5e942a41..b0f0b44e26 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc @@ -50,6 +50,7 @@ static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = %(nbhel)d; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = %(ndiagrams)d; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = %(ncolor)s; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. 
a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 76b6e773bd..0665bfb93b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -1,7 +1,7 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. ! Further modified by: J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. @@ -16,9 +16,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -26,10 +27,7 @@ namespace mg5amcCpu using Parameters_%(model_name)s_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_%(model_name)s_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = %(ncolor)s; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -88,12 +86,58 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- @@ -117,8 +161,10 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif + + // Enable SIGFPE traps for Floating Point Exceptions #ifdef MGONGPUCPP_DEBUG - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + fpeEnable(); #endif } @@ -148,6 +194,10 @@ namespace mg5amcCpu //m_pars->printDependentCouplings(); // now computed event-by-event (running alphas #373) } %(initProc_lines)s +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: 
initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory %(cipdassign)s @@ -183,6 +233,10 @@ namespace mg5amcCpu //Parameters_%(model_name)s::printDependentCouplings(); // now computed event-by-event (running alphas #373) } %(hardcoded_initProc_lines)s +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -303,8 +357,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -312,25 +366,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%%4d rndhel=%%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // 
no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%%d icol=%%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], 
running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -475,13 +707,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { %(den_factors)s }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) %% mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) %% mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 895b7ec1d6..7de8886b1d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -17,6 +17,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_%(model_name)s.h" #include @@ -46,7 +47,7 @@ namespace mg5amcCpu 
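The new GPU sigmaKin and color_sum_gpu signatures above (with their ghel* super-buffers, per-helicity streams and BLAS handle) are designed around a stream-per-helicity pattern. Below is a self-contained CUDA sketch of that pattern only; the kernels, buffer contents and sizes are made up for illustration and stand in for calculate_jamps, the color sum and add_and_select_hel respectively.

#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for calculate_jamps (+ color sum): fill this helicity's slice of the super-buffer
__global__ void fillHelicity( float* ghelBuf, int ighel, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) ghelBuf[ighel * nevt + ievt] = 1.f + ighel;
}

// Stand-in for add_and_select_hel: sum the per-helicity MEs for each event
__global__ void sumHelicities( const float* ghelBuf, float* allMEs, int nGoodHel, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt >= nevt ) return;
  float me = 0.f;
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) me += ghelBuf[ighel * nevt + ievt];
  allMEs[ievt] = me;
}

int main()
{
  const int nGoodHel = 4, gpublocks = 2, gputhreads = 32, nevt = gpublocks * gputhreads;
  float *ghelBuf = nullptr, *allMEs = nullptr;
  cudaMalloc( &ghelBuf, nGoodHel * nevt * sizeof( float ) );
  cudaMalloc( &allMEs, nevt * sizeof( float ) );
  cudaStream_t streams[nGoodHel];
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  // (1) one launch per good helicity, each on its own stream
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    fillHelicity<<<gpublocks, gputhreads, 0, streams[ighel]>>>( ghelBuf, ighel, nevt );
  // (2) wait for all helicity streams before any cross-helicity work
  cudaDeviceSynchronize();
  // (3) sum over helicities (the real code then also selects one helicity and one color per event)
  sumHelicities<<<gpublocks, gputhreads>>>( ghelBuf, allMEs, nGoodHel, nevt );
  cudaDeviceSynchronize();
  float me0 = 0.f;
  cudaMemcpy( &me0, allMEs, sizeof( float ), cudaMemcpyDeviceToHost );
  std::printf( "ME[0] = %f (expected 1+2+3+4 = 10)\n", me0 );
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
  cudaFree( ghelBuf );
  cudaFree( allMEs );
  return 0;
}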
//-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -54,9 +55,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -76,34 +79,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: 
momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 2700d7e7da..aac7506855 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -8,145 +8,43 @@ !========================================================================== // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called %(process_class_name)s::matrix_%(proc_name)s(%(matrix_args)s)?) -%(color_matrix_lines)s - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; -#else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index d49047a623..4372edde52 100644 --- 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -6,18 +6,23 @@ ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. ! Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. !========================================================================== - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -42,93 +47,30 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%%4d rndhel=%%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%%d icol=%%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -170,7 +112,7 @@ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -193,7 +135,7 @@ // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -202,21 +144,23 @@ } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -230,8 +174,10 @@ for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -247,11 +193,12 @@ //printf( "sigmaKin: ievt=%%4d rndhel=%%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt2, ihelF ); break; } } @@ -353,14 +300,15 @@ #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 7d7996a674..3f8a85afa6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py 
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1308,33 +1308,43 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): self.couplings2order = self.helas_call_writer.couplings2order self.params2order = self.helas_call_writer.params2order ret_lines.append(""" - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else 
+ cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -1346,7 +1356,6 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -1355,14 +1364,17 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\\n", ihel );""") + //if( debug ) printf( \"calculate_jamps: ievt00=%d ihel=%2d\\n\", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( \"calculate_jamps: ievt=%6d ihel=%2d\\n\", ievt, ihel ); +#endif /* clang-format on */""") nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions() ret_lines.append(""" // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here @@ -1389,14 +1401,10 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even 
pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif""") @@ -1413,6 +1421,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): file = self.get_matrix_single_process( i, me, color_amplitudes[i], class_name ) file = '\n'.join( file.split('\n')[8:] ) # skip first 8 lines in process_matrix.inc (copyright) file_extend.append( file ) + assert i == 0, "more than one ME in get_all_sigmaKin_lines" # AV sanity check (added for color_sum.cc but valid independently) ret_lines.extend( file_extend ) return '\n'.join(ret_lines) @@ -1442,7 +1451,7 @@ def generate_process_files(self): self.edit_check_sa() self.edit_mgonGPU() self.edit_processidfile() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses) - + self.edit_colorsum() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses) self.edit_testxxx() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) self.edit_memorybuffers() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) self.edit_memoryaccesscouplings() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) @@ -1523,6 +1532,17 @@ def edit_processidfile(self): ff.write(template % replace_dict) ff.close() + # AV - new method + def edit_colorsum(self): + """Generate color_sum.cc""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_colorsum') + template = open(pjoin(self.template_path,'gpu','color_sum.cc'),'r').read() + replace_dict = {} + # Extract color matrix again (this was also in get_matrix_single_process called within get_all_sigmaKin_lines) + replace_dict['color_matrix_lines'] = self.get_color_matrix_lines(self.matrix_elements[0]) + ff = open(pjoin(self.path, 'color_sum.cc'),'w') + ff.write(template % replace_dict) + ff.close() def generate_subprocess_directory_end(self, **opt): """ opt contain all local variable of the fortran original function""" @@ -1693,11 +1713,11 @@ def get_color_matrix_lines(self, matrix_element): """Return the color matrix definition lines for this matrix element. 
Split rows in chunks of size n.""" import madgraph.core.color_algebra as color if not matrix_element.get('color_matrix'): - return '\n'.join([' static constexpr fptype2 denom[1] = {1.};', 'static const fptype2 cf[1][1] = {1.};']) + return '\n'.join([' static constexpr fptype2 colorDenom[1] = {1.};', 'static const fptype2 cf[1][1] = {1.};']) else: color_denominators = matrix_element.get('color_matrix').\ get_line_denominators() - denom_string = ' static constexpr fptype2 denom[ncolor] = { %s }; // 1-D array[%i]' \ + denom_string = ' static constexpr fptype2 colorDenom[ncolor] = { %s }; // 1-D array[%i]' \ % ( ', '.join(['%i' % denom for denom in color_denominators]), len(color_denominators) ) matrix_strings = [] my_cs = color.ColorString() @@ -1705,12 +1725,12 @@ def get_color_matrix_lines(self, matrix_element): # Then write the numerators for the matrix elements num_list = matrix_element.get('color_matrix').get_line_numerators(index, denominator) matrix_strings.append('{ %s }' % ', '.join(['%d' % i for i in num_list])) - matrix_string = ' static constexpr fptype2 cf[ncolor][ncolor] = ' - if len( matrix_strings ) > 1 : matrix_string += '{\n ' + ',\n '.join(matrix_strings) + ' };' + matrix_string = ' static constexpr fptype2 colorMatrix[ncolor][ncolor] = ' + if len( matrix_strings ) > 1 : matrix_string += '{\n ' + ',\n '.join(matrix_strings) + ' };' else: matrix_string += '{ ' + matrix_strings[0] + ' };' matrix_string += ' // 2-D array[%i][%i]' % ( len(color_denominators), len(color_denominators) ) - denom_comment = '\n // The color denominators (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) - matrix_comment = '\n // The color matrix (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) + denom_comment = '\n // The color denominators (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) + matrix_comment = '\n // The color matrix (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) denom_string = denom_comment + denom_string matrix_string = matrix_comment + matrix_string return '\n'.join([denom_string, matrix_string]) @@ -1905,7 +1925,6 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -1919,7 +1938,6 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -1930,6 +1948,10 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi for( int i = 0; i < ncolor; i++ 
) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 6562d189da..e54290d5a7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -103,6 +103,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): s+'CMake/src/CMakeLists.txt' ], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', + s+'gpu/color_sum.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', s+'gpu/MemoryAccessMatrixElements.h', s+'gpu/MemoryAccessMomenta.h', s+'gpu/MemoryAccessRandomNumbers.h', s+'gpu/MemoryAccessWeights.h', @@ -127,6 +128,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): to_link_in_P = ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', + 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', diff --git a/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh b/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh index 097935efc8..00004df108 100755 --- a/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh @@ -8,38 +8,69 @@ set -e # fail on error cd $(dirname $0)/.. 
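Note on the model_handling.py hunks above: the former calculate_wavefunctions is split into calculate_jamps, which only evaluates the QCD partial amplitudes (jamps) for one helicity, while the colour sum is moved to the newly generated color_sum.cc/color_sum.h (added to the SubProcesses file lists above); the colour constants are also renamed denom to colorDenom and cf to colorMatrix. As a purely illustrative, standalone sketch (not the generated code), the contraction that the separate colour-sum step performs has the standard MadGraph form |M|^2 = sum_i ( sum_j colorMatrix[i][j] * jamp[j] ) * conj( jamp[i] ) / colorDenom[i]:

    // Illustrative sketch only; the names colorMatrix/colorDenom follow the renamed arrays above.
    #include <complex>
    template<int ncolor>
    double colorSum( const std::complex<double> jamp[ncolor],  // QCD partial amplitudes for one event and helicity
                     const double colorMatrix[ncolor][ncolor], // colour matrix numerators
                     const double colorDenom[ncolor] )         // colour matrix denominators
    {
      double me2 = 0;
      for( int i = 0; i < ncolor; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = 0; j < ncolor; j++ ) ztemp += colorMatrix[i][j] * jamp[j];
        me2 += ( ztemp * std::conj( jamp[i] ) ).real() / colorDenom[i];
      }
      return me2;
    }

In the new scheme this contraction runs on the jamp buffers produced by calculate_jamps, as a separate function/kernel rather than inside the wavefunction evaluation; the channelId handling is split accordingly, with the CUDA branch reading a per-event scalar via gpu_channelId( allChannelIds ) as shown above.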
-./CODEGEN/generateAndCompare.sh -q ee_mumu -./CODEGEN/generateAndCompare.sh -q ee_mumu --mad +bsm= +while [ "$1" != "" ]; do + if [ "$1" == "-bsmonly" ] && [ "$bsm" != "-nobsm" ]; then + bsm=$1 + shift + elif [ "$1" == "-nobsm" ] && [ "$bsm" != "-bsmonly" ]; then + bsm=$1 + shift + else + echo "Usage: $0 [-bsmonly|-nobsm]" + fi +done -./CODEGEN/generateAndCompare.sh -q gg_tt -./CODEGEN/generateAndCompare.sh -q gg_tt --mad +# SM processes (both mad and sa) -./CODEGEN/generateAndCompare.sh -q gg_ttg -./CODEGEN/generateAndCompare.sh -q gg_ttg --mad +if [ "${bsm}" != "-bsmonly" ]; then -./CODEGEN/generateAndCompare.sh -q gg_ttgg -./CODEGEN/generateAndCompare.sh -q gg_ttgg --mad + ./CODEGEN/generateAndCompare.sh -q ee_mumu + ./CODEGEN/generateAndCompare.sh -q ee_mumu --mad -./CODEGEN/generateAndCompare.sh -q gg_ttggg -./CODEGEN/generateAndCompare.sh -q gg_ttggg --mad + ./CODEGEN/generateAndCompare.sh -q gg_tt + ./CODEGEN/generateAndCompare.sh -q gg_tt --mad -./CODEGEN/generateAndCompare.sh -q gq_ttq -./CODEGEN/generateAndCompare.sh -q gq_ttq --mad + ./CODEGEN/generateAndCompare.sh -q gg_ttg + ./CODEGEN/generateAndCompare.sh -q gg_ttg --mad -./CODEGEN/generateAndCompare.sh -q heft_gg_bb -./CODEGEN/generateAndCompare.sh -q heft_gg_bb --mad + ./CODEGEN/generateAndCompare.sh -q gg_ttgg + ./CODEGEN/generateAndCompare.sh -q gg_ttgg --mad -./CODEGEN/generateAndCompare.sh -q susy_gg_tt -./CODEGEN/generateAndCompare.sh -q susy_gg_tt --mad + ./CODEGEN/generateAndCompare.sh -q gg_ttggg + ./CODEGEN/generateAndCompare.sh -q gg_ttggg --mad -./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 -./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 --mad + ./CODEGEN/generateAndCompare.sh -q gq_ttq + ./CODEGEN/generateAndCompare.sh -q gq_ttq --mad -./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt -./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt --mad +fi -./CODEGEN/generateAndCompare.sh -q nobm_pp_ttW --mad +# BSM processes -./CODEGEN/generateAndCompare.sh -q gg_tt01g --mad +if [ "${bsm}" != "-nobsm" ]; then -./CODEGEN/generateAndCompare.sh -q pp_tt012j --mad + ./CODEGEN/generateAndCompare.sh -q heft_gg_bb + ./CODEGEN/generateAndCompare.sh -q heft_gg_bb --mad + + ./CODEGEN/generateAndCompare.sh -q susy_gg_tt + ./CODEGEN/generateAndCompare.sh -q susy_gg_tt --mad + + ./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 + ./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 --mad + + ./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt + ./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt --mad + + ./CODEGEN/generateAndCompare.sh -q nobm_pp_ttW --mad + +fi + +# SM processes (mad only) + +if [ "${bsm}" != "-bsmonly" ]; then + + ./CODEGEN/generateAndCompare.sh -q gg_tt01g --mad + + ./CODEGEN/generateAndCompare.sh -q pp_tt012j --mad + +fi diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 6221b1cfee..fd46fd38f3 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -1,8 +1,8 @@ -#!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +#!/usr/bin/env bash +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
set -e # fail on error @@ -358,10 +358,12 @@ function codeGenAndDiff() fi done fi - # Remove card.jpg, diagrams.html and matrix*.jpg files (NB: these are only created if ghostscript is installed) + # Remove card.jpg/png, diagrams.html and matrix*.jpg/png files (NB: these are only created if ghostscript is installed) \rm -f ${outproc}/SubProcesses/P*/card.jpg + \rm -f ${outproc}/SubProcesses/P*/card.png \rm -f ${outproc}/SubProcesses/P*/diagrams.html \rm -f ${outproc}/SubProcesses/P*/matrix*jpg + \rm -f ${outproc}/SubProcesses/P*/matrix*png # Cleanup \rm -f ${outproc}/crossx.html \rm -f ${outproc}/index.html @@ -474,13 +476,6 @@ EOF if $SCRDIR/diffCode.sh ${BRIEF} -r -c ${proc}.${autosuffix}.BKP ${proc}.${autosuffix}; then echo "Old and new generated codes are identical"; else echo -e "\nWARNING! Old and new generated codes differ"; fi popd >& /dev/null fi - # Compare the existing manually developed code to the newly generated code for the specific process - if [ "${OUTBCK}" == "cudacpp" ] || [ "${OUTBCK}" == "gridpack" ]; then - pushd ${OUTDIR} >& /dev/null - echo -e "\n+++ Compare manually developed code to newly generated code for $proc\n" - if $SCRDIR/diffCode.sh ${BRIEF} -r -c ${proc} ${proc}.${autosuffix}; then echo "Manual and generated codes are identical"; else echo -e "\nWARNING! Manual and generated codes differ"; fi - popd >& /dev/null - fi # Print a summary of the available code if [ "$QUIET" != "1" ]; then echo diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index db84a9053c..f41ae1e58f 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006200551986694336  +DEBUG: model prefixing takes 0.008341312408447266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +149,21 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.004 s +1 processes with 2 diagrams generated in 0.003 s Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -176,22 +175,22 @@ FileWriter mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.070 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (2 diagrams) in 0.005 s +Wrote files for 8 helas calls in 0.063 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.201 s +ALOHA: aloha creates 3 routines in 0.181 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.253 s +ALOHA: aloha creates 7 
routines in 0.189 s FFV1 FFV1 FFV2 @@ -200,38 +199,32 @@ ALOHA: aloha creates 7 routines in 0.253 s FFV4 FFV2_4 FFV2_4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses/P1_epem_mupmum; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 236 (offset 9 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. 
quit -real 0m2.054s -user 0m1.767s -sys 0m0.275s -Code generation completed in 2 seconds +real 0m2.357s +user 0m1.875s +sys 0m0.386s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -244,7 +237,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -252,10 +245,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -274,7 +266,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -282,10 +274,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). 
All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat index bb623f867a..2343b09819 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat index 74f70b567b..c1037c83d7 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat @@ -112,6 +112,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat index 68ee164d00..4ba7540657 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat @@ -112,6 +112,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt +++ b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/Source/.make_opts b/epochX/cudacpp/ee_mumu.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/.make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f b/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc b/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/ee_mumu.mad/Source/make_opts b/epochX/cudacpp/ee_mumu.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/ee_mumu.mad/Source/makefile b/epochX/cudacpp/ee_mumu.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/makefile +++ b/epochX/cudacpp/ee_mumu.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc b/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc index 80d5ae41aa..83061d9ae9 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
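For orientation amid the reflowed comments, here is a hypothetical host-side driver for the cpu_sequence method documented here and declared just below. This is illustrative only: in production these calls go through the Fortran fbridge layer, and nevt/npar/np4 are example values (npar=4 matching e+ e- > mu+ mu-).

    #include "Bridge.h"
    #include <vector>
    // Hypothetical sketch, not part of the patch: one Bridge round trip for nevt
    // events in double precision on the C++ (non-GPU) build.
    void exampleBridgeCall()
    {
      const unsigned int nevt = 16384, npar = 4, np4 = 4; // example values only
      mg5amcCpu::Bridge<double> bridge( nevt, npar, np4 );
      std::vector<double> momenta( nevt * npar * np4 ), gs( nevt ), rndhel( nevt ), rndcol( nevt ), mes( nevt );
      std::vector<int> selhel( nevt ), selcol( nevt );
      // ... fill momenta (Fortran AOS layout), gs and the random numbers from the generator ...
      bridge.cpu_sequence( momenta.data(), gs.data(), rndhel.data(), rndcol.data(),
                           nullptr /* channelIds: nullptr disables multichannel mode */,
                           mes.data(), selhel.data(), selcol.data() );
      // mes now holds one matrix element per event; selhel/selcol hold the selected helicity/colour.
    }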
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
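The grid-sizing logic in the reflowed constructor above simply halves the thread count, starting from the default of 256, until gpublocks*gputhreads exactly equals nevt, and throws if it would drop below the minimum of 32. A standalone restatement for readability only (not part of the patch):

    #include <stdexcept>
    #include <string>
    #include <utility>
    // Illustrative restatement of the Bridge constructor's grid choice.
    inline std::pair<int, int> chooseGrid( int nevt, int gputhreads = 256, const int gputhreadsmin = 32 )
    {
      if( nevt < gputhreadsmin || nevt % gputhreadsmin != 0 )
        throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
      int gpublocks = nevt / gputhreads;
      while( nevt != gpublocks * gputhreads )
      {
        gputhreads /= 2; // e.g. nevt=96: 256 -> 128 -> 64 -> 32, giving 3 blocks of 32 threads
        if( gputhreads < gputhreadsmin ) throw std::logic_error( "cannot choose gputhreads" );
        gpublocks = nevt / gputhreads;
      }
      return std::make_pair( gpublocks, gputhreads );
    }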
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
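//--------------------------------------------------------------------------
// For reference: a minimal host-only sketch (not part of the plugin code) of the
// F2C index mapping implemented by the transposition loops in these hunks, i.e. the
// copy from Fortran-style AOS momenta[ievt][ipar][ip4] into C++-style AOSOA
// momenta[ipagM][ipar][ip4][ieppM] with nevt = npagM * neppM. The npar, np4 and
// neppM values below are illustrative placeholders; the real ones come from
// CPPProcess and MemoryAccessMomenta.
#include <cassert>
namespace sketch
{
  constexpr int npar = 4;  // placeholder for CPPProcess::npar
  constexpr int np4 = 4;   // placeholder for CPPProcess::np4
  constexpr int neppM = 4; // placeholder for MemoryAccessMomenta::neppM
  inline void transposeMomentaF2C( const double* in, double* out, int nevt )
  {
    assert( nevt % neppM == 0 ); // same assumption as in the transposition code above
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      const int ipagM = ievt / neppM; // SIMD "page" index
      const int ieppM = ievt % neppM; // event index within the page
      for( int ipar = 0; ipar < npar; ipar++ )
        for( int ip4 = 0; ip4 < np4; ip4++ )
        {
          const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                  // AOS (Fortran-style)
          const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // AOSOA (C++-style)
          out[cpos] = in[fpos]; // F2C; swapping the roles of fpos and cpos gives the C2F direction
        }
    }
  }
}
//--------------------------------------------------------------------------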
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
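//--------------------------------------------------------------------------
// A small host-only illustration (an assumption-laden sketch, not the plugin code)
// of the compile-time precision dispatch behind the gpuBlasT* aliases added to
// GpuAbstraction.h above: one generic "T" name resolves to the single- or
// double-precision entry point depending on MGONGPU_FPTYPE2_FLOAT. The sketchSdot
// and sketchDdot stand-ins below are hypothetical host functions used only to keep
// the example self-contained; the real aliases map to cuBLAS/hipBLAS calls.
#include <cstdio>
inline float sketchSdot( int n, const float* x, const float* y )
{
  float s = 0;
  for( int i = 0; i < n; i++ ) s += x[i] * y[i];
  return s;
}
inline double sketchDdot( int n, const double* x, const double* y )
{
  double s = 0;
  for( int i = 0; i < n; i++ ) s += x[i] * y[i];
  return s;
}
#ifdef MGONGPU_FPTYPE2_FLOAT
typedef float fptype2sketch;
#define sketchTdot sketchSdot // "T" resolves to the single-precision variant
#else
typedef double fptype2sketch;
#define sketchTdot sketchDdot // "T" resolves to the double-precision variant
#endif
inline void sketchTdotDemo()
{
  fptype2sketch x[3] = { 1, 2, 3 }, y[3] = { 4, 5, 6 };
  std::printf( "dot = %f\n", (double)sketchTdot( 3, x, y ) ); // same call site for both precisions
}
//--------------------------------------------------------------------------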
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
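//--------------------------------------------------------------------------
// A host-only sketch (not the plugin code) of the runtime toggle pattern used in the
// MatrixElementKernelDevice constructor above: a feature is enabled when an
// environment variable such as CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty,
// and the decision (with its one-time INFO printout) is taken only once.
#include <cstdlib>
#include <iostream>
#include <string>
inline bool sketchUseBlasColorSum()
{
  static const bool enabled = []() {
    const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
    const bool on = ( env != nullptr && std::string( env ) != "" );
    std::cout << ( on ? "INFO: BLAS color sum enabled at runtime"
                      : "INFO: BLAS color sum disabled at runtime" )
              << std::endl;
    return on;
  }();
  return enabled;
}
//--------------------------------------------------------------------------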
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 7bd57a8dbb..624eb3e3d4 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 1; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
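//--------------------------------------------------------------------------
// A host-only sketch (not the plugin code) of the event-major layout behind the
// DeviceAccessJamp2 accessor introduced above: the jamp2 value for color icol and
// event ievt lives at buffer[icol * nevt + ievt], so that consecutive GPU threads
// (consecutive ievt) touch consecutive addresses, i.e. accesses are coalesced.
struct SketchJamp2View
{
  double* buffer; // points to ncolor * nevt values, contiguous in ievt for each icol
  int nevt;       // number of events
  double& at( int icol, int ievt ) { return buffer[icol * nevt + ievt]; }
};
//--------------------------------------------------------------------------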
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -354,152 +410,43 @@ namespace mg5amcCpu jamp_sv[0] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_epem_mupmum()?) 
- - // The color denominators (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1 }; // 1-D array[1] - - // The color matrix (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
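//--------------------------------------------------------------------------
// For reference: a standalone host-only sketch (not the plugin code) of the quadratic
// form that the removed inline loops below computed, and that the color-sum code
// factored out of calculate_jamps (per the comments above) now evaluates. With a real
// symmetric color matrix cf, color denominators denom and complex color-flow
// amplitudes jamp, the contribution to |M|^2 is
// sum_i ( jampR_i * (cf.jampR)_i + jampI_i * (cf.jampI)_i ) / denom_i,
// since for a real matrix the cross terms of (A-iB).cf.(A+iB) cancel.
#include <complex>
#include <vector>
inline double sketchColorSum( const std::vector<std::complex<double>>& jamp, // [ncolor]
                              const std::vector<std::vector<double>>& cf,    // [ncolor][ncolor]
                              const std::vector<double>& denom )             // [ncolor]
{
  const int ncolor = (int)jamp.size();
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real();
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    me2 += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  return me2;
}
//--------------------------------------------------------------------------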
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -539,7 +486,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -572,6 +523,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; @@ -613,6 +568,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -733,8 +692,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -742,25 +701,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + 
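      // Illustrative note (hypothetical helper, not part of this kernel): the selection below is
      // inverse-CDF sampling on the cumulative sums just written into ghelAllMEs. Given running
      // sums c[0] <= c[1] <= ... <= c[n-1] and a random number r in [0,1), it picks the first
      // ighel with r < c[ighel] / c[n-1], i.e. each good helicity is chosen with probability
      // proportional to its |M|^2 contribution. A scalar sketch of the same logic:
      //   int pickFromCumulative( const fptype* c, const int n, const fptype r )
      //   {
      //     for( int i = 0; i < n; i++ )
      //       if( r < c[i] / c[n - 1] ) return i;
      //     return n - 1; // guard against r rounding up towards 1
      //   }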
//printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -905,13 +1042,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -923,18 +1054,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -959,93 +1095,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1087,7 +1160,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1110,7 +1183,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1119,21 +1192,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1147,8 +1222,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1164,11 +1241,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1270,14 +1348,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index 159826a904..9339b0e34c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 1; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f index 70fe04e4d8..3ce157a97e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f index 280eff025e..60bee2a1c7 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF EP1=PDG2PDF(LPP(IB(1)),-11, IB(1),XBK(IB(1)), QSCALE) IF (PDLABEL.EQ.'dressed') EP1_COMPONENTS(1:4 ) = @@ -149,7 +149,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF EM2=PDG2PDF(LPP(IB(2)),11, IB(2),XBK(IB(2)), QSCALE) IF (PDLABEL.EQ.'dressed') EM2_COMPONENTS(1:4 ) = @@ -228,7 +228,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -302,6 +302,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -385,14 +389,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) EP1(IVEC)=PDG2PDF(LPP(IB(1)),-11, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) IF (PDLABEL.EQ.'dressed') EP1_COMPONENTS(1:4 , IVEC) = $ EE_COMPONENTS(1:4) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) EM2(IVEC)=PDG2PDF(LPP(IB(2)),11, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) IF (PDLABEL.EQ.'dressed') EM2_COMPONENTS(1:4 , IVEC) = $ EE_COMPONENTS(1:4) ENDIF @@ -460,51 +464,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) 
THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc new file mode 100644 index 0000000000..44aadd6b60 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc @@ -0,0 +1,425 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1 }; // 1-D array[1] + + // The color matrix (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + 
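As an aside to the new color_sum.cc code, the color algebra implemented by color_sum_cpu below (and, further down, by the CUDA kernel and BLAS paths) is the quadratic form |M|^2 += sum_ij conj(J_i) * (colorMatrix[i][j]/colorDenom[i]) * J_j. Because the color matrix is real, the form splits into separate sums over the real and imaginary parts of the jamps, and because the normalized matrix is symmetric (#475) only the upper triangle is needed, with off-diagonal terms counted twice. A minimal scalar C++ sketch with hypothetical standalone inputs (illustration only, not the plugin's data layout):

  #include <complex>

  // Scalar sketch of the color sum for one event and one helicity: cf is the symmetric
  // normalized matrix colorMatrix[i][j]/colorDenom[i] flattened row-major, jamp holds the
  // ncol partial amplitudes (hypothetical inputs, for illustration only).
  double colorSumSketch( const std::complex<double>* jamp, const double* cf, const int ncol )
  {
    double me2 = 0;
    for( int i = 0; i < ncol; i++ )
    {
      // Diagonal term, then twice the upper triangle (symmetry of the normalized matrix)
      double ztempR = cf[i * ncol + i] * jamp[i].real();
      double ztempI = cf[i * ncol + i] * jamp[i].imag();
      for( int j = i + 1; j < ncol; j++ )
      {
        ztempR += 2 * cf[i * ncol + j] * jamp[j].real();
        ztempI += 2 * cf[i * ncol + j] * jamp[j].imag();
      }
      me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
    }
    return me2; // this helicity's contribution to |M|^2, before the final 1/helcolDenominators
  }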
+#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using 
E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good 
helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } 
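    // Layout note (illustrative only): with the striding used in convertD2F_Jamps above, the flat
    // index of jamp(ix2,icol,ihel,ievt) is ((ix2*ncolor + icol)*nhel + ihel)*nevt + ievt, e.g. via
    // a hypothetical helper
    //   inline size_t jampIndex( int ix2, int icol, int ihel, int ievt, int nhel, int nevt )
    //   { return ( ( (size_t)ix2 * ncolor + icol ) * nhel + ihel ) * nevt + ievt; }
    // so all real parts (ix2=0) occupy the first ncolor*nhel*nevt entries and all imaginary parts
    // (ix2=1) the next ncolor*nhel*nevt entries, which is what the ghelAllJampsReal and
    // ghelAllJampsImag pointers below rely on.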
+ // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.h 
b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/configs.inc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/configs.inc index b17a3fe72a..e42ad21d89 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/configs.inc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/configs.inc @@ -12,3 +12,5 @@ C Diagram 2 DATA TPRID(-1,2)/0/ C Number of configs DATA MAPCONFIG(0)/2/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. 
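For reference, an illustrative sketch (not part of this patch; buffer and function names are hypothetical): the BLAS path in color_sum_blas above computes, for each (helicity, event) pair, ME += Re( J^dagger * CF * J ), where CF is the (real) normalized color matrix held in devNormColMat and J is the vector of ncolor color amplitudes for that pair. Step 1 forms Ztemp = CF * J for the real and imaginary parts separately (two GEMMs over all nhel*nevt columns), and Step 2 accumulates the per-pair dot products into the MEs (two strided-batched GEMMs with a 1x1 result per batch and beta=1). A minimal CPU reference of that arithmetic:

// Illustrative CPU reference for the BLAS color sum above (assumed names, not part of the patch).
// CF is the real, normalized ncolor x ncolor color matrix; jamps uses the
// [2][ncolor][nhel][nevt] layout (real block followed by imaginary block).
#include <cstddef>
#include <vector>

void colorSumReference( std::vector<double>& allMEs,      // [nhel*nevt], accumulated in place (beta=1)
                        const std::vector<double>& jamps, // [2*ncolor*nhel*nevt]
                        const std::vector<double>& cf,    // [ncolor*ncolor], row-major
                        int ncolor, int nhel, int nevt )
{
  const std::size_t nhe = std::size_t( nhel ) * nevt; // one "batch" per (ihel,ievt) pair
  const double* jR = jamps.data();                    // real parts
  const double* jI = jamps.data() + ncolor * nhe;     // imaginary parts
  for( std::size_t k = 0; k < nhe; k++ )
  {
    double me = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      double zR = 0, zI = 0; // Step 1: Ztemp = CF * Jamps (real and imaginary parts separately)
      for( int j = 0; j < ncolor; j++ )
      {
        zR += cf[i * ncolor + j] * jR[j * nhe + k];
        zI += cf[i * ncolor + j] * jI[j * nhe + k];
      }
      me += jR[i * nhe + k] * zR + jI[i * nhe + k] * zI; // Step 2: Re( conj(J) . Ztemp )
    }
    allMEs[k] += me; // beta=1: add to the running MEs for this helicity and event
  }
}

The super-buffer layout assumed here (ievt fastest, then ihel, then icol, with the real block before the imaginary block) matches the DeviceAccessJamp striding in the new color_sum.h further down in this diff.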
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fbridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/makefile_original.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f index 1a2e5df4e6..7cc484494b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -236,17 +233,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -325,7 +311,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -368,7 +354,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -411,17 +398,22 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 1) /1.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 1) /1/ C 1 ColorOne() C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WZ.NE.0D0) FK_MDL_WZ = SIGN(MAX(ABS(MDL_WZ), ABS(MDL_MZ - $ *SMALL_WIDTH_TREATMENT)), MDL_WZ) + FK_ZERO = 0D0 + IF(MDL_WZ.NE.0D0) THEN + FK_MDL_WZ = SIGN(MAX(ABS(MDL_WZ), ABS(MDL_MZ + $ *SMALL_WIDTH_TREATMENT)), MDL_WZ) + ELSE + FK_MDL_WZ = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
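An illustrative aside (not part of the patch): the previous hunk replaces the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix with an integer array CF(NCOLOR*(NCOLOR+1)/2) holding only the upper triangle, plus a common integer DENOM that is divided out once at the end of the color sum in the next hunk; that hunk walks the packed storage with the sequential CF_INDEX counter over J = I, NCOLOR. A small standalone C++ check of the packed-index convention (the ncolor value and names are hypothetical):

// Packed upper-triangle indexing used by the new CF(NCOLOR*(NCOLOR+1)/2) storage
// (illustrative check only; not part of the patch).
#include <cassert>

int packedIndex( int i, int j, int ncolor ) // 1-based, requires j >= i
{
  return ( i - 1 ) * ncolor - ( i - 1 ) * ( i - 2 ) / 2 + ( j - i + 1 );
}

int main()
{
  const int ncolor = 6; // any small value
  int cfIndex = 0;      // mirrors CF_INDEX in the Fortran loop below
  for( int i = 1; i <= ncolor; i++ )
    for( int j = i; j <= ncolor; j++ )
    {
      ++cfIndex; // DO J = I, NCOLOR walks the row-major upper triangle sequentially
      assert( cfIndex == packedIndex( i, j, ncolor ) );
    }
  assert( cfIndex == ncolor * ( ncolor + 1 ) / 2 ); // total packed size, as in CF's declaration
  return 0;
}

The closed-form index is only for illustration; the generated Fortran never needs it, since it always traverses the triangle in the same fixed order.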
@@ -455,10 +447,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -467,6 +461,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * 
blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk new file 
mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) 
+ logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

\ Postscript Diagrams for $proc\<\/A\> \ \n"; for($j=1;$j<$pages;$j++){ - print PAGE "\\"Page \ \n"; + print PAGE "\\"Page \ \n"; }#end of for # -# In case I didn't include all of the diagrams as jpeg, warn user +# In case I didn't include all of the diagrams as PNG, warn user # - if (-e "matrix$imatrix$max_jpg.jpg" ) { - print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
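#----------------------------------------------------------------------------------------------------------------------------------
# Illustrative sketch (not the plugin's actual code): the gen_ximprove_gridpack changes above add an optional nprocs-way parallel
# mode, submitting refine jobs to a local pool (cluster.MultiCore) monitored by gridpack_wait_monitoring instead of running them
# one by one through cluster.onecore.launch_and_wait. The stand-alone sketch below mirrors that dispatch pattern using only the
# standard library; the job dicts, their keys and the helper name are hypothetical and are not the plugin's API.
import subprocess
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_refine_jobs(jobs, nprocs=1):
    """Run a list of {'exe': path, 'cwd': dir} jobs serially or with nprocs workers (illustration only)."""
    if nprocs == 1:
        # serial path, with an occasional progress message so long runs do not look stuck
        for i, job in enumerate(jobs, 1):
            if i % 5 == 0:
                print("Working on job %d of %d" % (i, len(jobs)))
            subprocess.run([job['exe']], cwd=job['cwd'], check=True)
        return
    # parallel path: submit everything, then report progress as jobs complete
    start = time.time()
    with ThreadPoolExecutor(max_workers=nprocs) as pool:
        futures = [pool.submit(subprocess.run, [j['exe']], cwd=j['cwd'], check=True) for j in jobs]
        for ndone, fut in enumerate(as_completed(futures), 1):
            fut.result()  # re-raise any job failure
            print("Gridpack event generation: %d pending, %d done [%.0fs]" % (len(jobs) - ndone, ndone, time.time() - start))
#----------------------------------------------------------------------------------------------------------------------------------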
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
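#----------------------------------------------------------------------------------------------------------------------------------
# Illustrative sketch (not the plugin's actual code): the histograms.py hunk above fixes the gnuplot detection for Python 3 by
# keeping the decoded subprocess output (output = output.decode(...)) and by comparing the integer major version of the reported
# gnuplot release instead of parsing the whole version token with float(). A stand-alone version of that check could look like the
# function below; the function name is mine and it assumes a gnuplot executable is available on the PATH.
import subprocess

def gnuplot_major_version(executable='gnuplot'):
    # typical output: b"gnuplot 5.4 patchlevel 2\n" -> decode, take "5.4", keep only the major part
    out = subprocess.run([executable, '--version'], capture_output=True).stdout
    out = out.decode(errors='ignore')
    return int(out.split()[1].split('.')[0])

# e.g. select the gnuplot-5 output templates only when the major version is at least 5:
# use_v5_templates = gnuplot_major_version() >= 5
#----------------------------------------------------------------------------------------------------------------------------------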
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
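The pseudorapidity fix in the lhe_parser.py hunk above swaps the numerator and denominator inside the logarithm: with the standard definition eta = 0.5*ln((|p|+pz)/(|p|-pz)), the corrected code now returns positive eta for forward (pz > 0) momenta. A small self-check in plain Python, using the equivalent eta = -ln(tan(theta/2)) form and hypothetical momentum components:

import math

def pseudorapidity(px, py, pz):
    # eta = 0.5 * ln((|p| + pz) / (|p| - pz)), as in the corrected code
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

px, py, pz = 1.0, 2.0, 3.0
theta = math.acos(pz / math.sqrt(px**2 + py**2 + pz**2))
print(pseudorapidity(px, py, pz))        # ~1.10
print(-math.log(math.tan(theta / 2.0)))  # same value, cross-check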
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
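With the do_pythia8 changes above, the default is now to drive the shower through Pythia8's bundled main164 program, falling back to the legacy MG5aMC_PY8_interface only when main164 cannot be found (or when '--old_interface' is passed explicitly). A hedged sketch of that lookup, with a hypothetical pythia8_path and simplified error handling:

import os

def find_pythia_main(pythia8_path):
    # Try the two candidate locations used in the patch, installed layout first.
    candidates = [
        os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
        os.path.join(pythia8_path, 'examples', 'main164'),
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None  # caller would then retry with the old interface

main = find_pythia_main('/opt/pythia8')   # hypothetical installation path
if main is None:
    print('main164 not found (or not compiled); retrying with --old_interface')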
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
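The shell-wrapper templates in the hunks above are now filled in two passes, which is why the placeholders that must survive the first pass are doubled to %%s: the first % substitution injects the interface-dependent '-c' flag, the second injects the shell and executable names. A minimal demonstration of this two-stage %-formatting (the concrete values are illustrative):

# First pass: fill in the '-c' flag, leave the doubled placeholders intact.
template = """#!%%s
./%%s %s PY8Card.dat >& PY8_log.txt
""" % ('-c')

# Second pass: fill in shell and executable (hypothetical values).
script = template % ('/bin/bash', 'main164')
print(script)
# #!/bin/bash
# ./main164 -c PY8Card.dat >& PY8_log.txt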
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
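The new remove_empty_events helper above reads each channel's log backwards and files directories with empty events.lhe into reason buckets ('cuts', 'zero', 'bwconfig', 'not found', ...) before logging a summary. A simplified, self-contained sketch of that classification pattern using collections.defaultdict; the log messages are taken from the patch, but the surrounding logic (file sizes, BackRead, line counters) is omitted:

import collections

def classify(gdir_logs):
    # Map each G directory to a reason bucket based on its log tail.
    reasons = collections.defaultdict(list)
    for gdir, log_lines in gdir_logs.items():
        for line in reversed(log_lines):   # mimic reading the log backwards
            if 'Loosen cuts or increase max_events' in line:
                reasons['cuts'].append(gdir)
                break
            if 'all returned zero' in line:
                reasons['zero'].append(gdir)
                break
            if 'Impossible BW configuration' in line:
                reasons['bwconfig'].append(gdir)
                break
        else:
            reasons['not found'].append(gdir)
    return reasons

print(dict(classify({'P1/G1': ['...', 'all returned zero'],
                     'P1/G2': ['no matching message here']})))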
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data b/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/ee_mumu.mad/bin/madevent b/epochX/cudacpp/ee_mumu.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/madevent +++ b/epochX/cudacpp/ee_mumu.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index 18f664e0d1..4dd98afc5d 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc index 37676c1d8d..dd3280eb5d 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 5fcde71f6b..0c43310313 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
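In the write_param_card.py hunk further up, exec and eval now receive an explicit param_values dictionary (seeded with cmath) instead of writing into the module namespace; under Python 3 this is the reliable way to make parameters defined by one exec visible to later eval calls in the same function. A minimal sketch with made-up parameter expressions in the same spirit:

import cmath

param_values = {'cmath': cmath}   # shared namespace, as in the patch

# Hypothetical UFO-style parameters: name -> expression or literal value.
parameters = [('aEWM1', '127.9'),
              ('aEW', '1./aEWM1'),
              ('ee', '2*cmath.sqrt(aEW*cmath.pi)')]

for name, value in parameters:
    exec("%s = %s" % (name, value), globals(), param_values)

# Dependent expressions can now be evaluated in the same explicit namespace.
print(complex(eval('ee**2/(4*cmath.pi)', globals(), param_values)).real)  # ~1/127.9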
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
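The restore_data change earlier in this diff selects the xargs parallelism options per platform, because macOS ships no nproc and, per the script's own comment, its xargs is given an explicit -S1024 limit on the command length. The shipped fix stays in shell; purely for illustration, a hedged Python equivalent of the same host detection:

import os
import platform

def xargs_parallel_opts():
    # Mirror the shell logic: -P <ncpu> everywhere, plus -S1024 on macOS.
    ncpu = os.cpu_count() or 1
    if platform.system() == 'Darwin':
        return '-P %d -S1024' % ncpu
    return '-P %d' % ncpu

print(xargs_parallel_opts())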
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index f27925604a..96bc83705d 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006340742111206055  +DEBUG: model prefixing takes 0.004626750946044922  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,28 +153,28 @@ INFO: Process has 2 diagrams Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.267 s +ALOHA: aloha creates 4 routines in 0.213 s FFV1 FFV1 FFV2 @@ -184,17 +183,17 @@ ALOHA: aloha creates 4 routines in 0.267 s FFV4 FFV2_4 FFV2_4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m0.659s -user 0m0.589s -sys 0m0.056s +real 0m0.603s +user 0m0.531s +sys 0m0.067s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. 
- * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
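The Bridge constructor above derives the GPU grid from the number of events: it starts from 256 threads per block and keeps halving until nevt is an exact multiple, never going below the 32-thread minimum. A small Python sketch of that selection loop (constants taken from the code above; error messages simplified):

def choose_gpu_grid(nevt, gputhreads=256, gputhreadsmin=32):
    # Return (gpublocks, gputhreads) such that gpublocks * gputhreads == nevt.
    if nevt < gputhreadsmin or nevt % gputhreadsmin != 0:
        raise RuntimeError('nevt should be a multiple of %d' % gputhreadsmin)
    gpublocks = nevt // gputhreads
    while nevt != gpublocks * gputhreads:
        gputhreads //= 2
        if gputhreads < gputhreadsmin:
            raise RuntimeError('cannot choose gputhreads')  # should never happen
        gpublocks = nevt // gputhreads
    return gpublocks, gputhreads

print(choose_gpu_grid(8192))  # (32, 256)
print(choose_gpu_grid(96))    # (3, 32)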
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
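        // Worked example of the index mapping in the loop below (illustrative values, assuming
        // npar=4, np4=4, neppM=4): event ievt=6 sits in page ipagM=1 at slot ieppM=2, so its
        // (ipar=2, ip4=3) component maps between fpos = 6*16 + 2*4 + 3 = 107 in the AOS
        // (Fortran) layout and cpos = 1*64 + 2*16 + 3*4 + 2 = 110 in the AOSOA (cudacpp) layout.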
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
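// Illustrative usage sketch (not taken from the plugin sources) of the checkGpuBlas macro and
// the gpuBlas* aliases defined in GpuRuntime.h and GpuAbstraction.h above; it assumes a CUDA or
// HIP build with BLAS enabled, and the function name and the column-major n*n device buffers
// dA, dB, dC are assumptions of this sketch.
#include "GpuAbstraction.h"
#include "GpuRuntime.h"
inline void exampleGemm( const double* dA, const double* dB, double* dC, const int n )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  const double alpha = 1.0, beta = 0.0;
  checkGpuBlas( gpuBlasDgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N, // C = A * B
                              n, n, n, &alpha, dA, n, dB, n, &beta, dC, n ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
}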
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 16a91dd141..42f5c25dcb 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 1; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
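// Illustrative sketch (not taken from the plugin sources) of the jamp2 layout addressed by
// DeviceAccessJamp2 above: the per-color |jamp|^2 running sums live in a [ncolor][nevt]
// super-buffer, one contiguous nevt-long slice per color, so element (icol, ievt) sits at
// icol*nevt + ievt. The kernel name and the jampAbs2 input below are assumptions of this
// sketch; atomicAdd keeps the accumulation safe when several one-helicity streams update the
// same buffer concurrently.
__global__ void accumulateJamp2( double* colAllJamp2s, const double* jampAbs2, const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x; // one GPU thread per event
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    atomicAdd( &colAllJamp2s[icol * nevt + ievt], jampAbs2[icol * nevt + ievt] );
}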
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -352,152 +408,43 @@ namespace mg5amcCpu jamp_sv[0] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_epem_mupmum()?) 
- - // The color denominators (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1 }; // 1-D array[1] - - // The color matrix (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
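The comment above (kept verbatim in the new color_sum.cc later in this patch) states the rewrite only in words. In formulas, with J = A + iB the vector of colour amplitudes (jamp) and M the normalised colour matrix, M_ij = cf_ij / denom_i, which is real and symmetric (#475):

  $$ J^\dagger M J \;=\; (A - iB)^{T} M \, (A + iB) \;=\; A^{T} M A + B^{T} M B $$

because the cross terms i A^T M B - i B^T M A cancel for a real symmetric M. The triangular constexpr table then exploits A^T M A = \sum_i M_{ii} A_i^2 + 2 \sum_{i<j} M_{ij} A_i A_j (and likewise for B), which is why value[icol][icol] stores cf[icol][icol]/denom[icol] while value[icol][jcol] with jcol > icol stores 2*cf[icol][jcol]/denom[icol].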
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
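The mixed-precision merge removed from this hunk reappears in the new color_sum_cpu further down: two double-precision SIMD event pages are packed into one single-precision vector for the colour algebra, and the resulting squared sums are split back into the two double halves. A conceptual stand-alone illustration of what fpvmerge / fpvsplit0 / fpvsplit1 do, using plain arrays rather than the real fptype_v/fptype2_v vector types (neppV = 4 and the helper names are assumptions of this sketch, not part of the patch):

  #include <array>
  // Conceptual stand-ins: with fptype=double and fptype2=float, one float vector spans two double pages.
  constexpr int neppV = 4;                    // assumed SIMD width of one double event page
  using dvec = std::array<double, neppV>;     // one "double" event page
  using fvec = std::array<float, 2 * neppV>;  // one "float" vector holding two pages

  // Merge two double pages into one float vector (what fpvmerge does, conceptually)
  fvec merge( const dvec& lo, const dvec& hi )
  {
    fvec out{};
    for( int i = 0; i < neppV; i++ ) { out[i] = (float)lo[i]; out[neppV + i] = (float)hi[i]; }
    return out;
  }
  // Split the float result back into the two double pages (fpvsplit0 / fpvsplit1, conceptually)
  dvec split0( const fvec& v ) { dvec out{}; for( int i = 0; i < neppV; i++ ) out[i] = v[i]; return out; }
  dvec split1( const fvec& v ) { dvec out{}; for( int i = 0; i < neppV; i++ ) out[i] = v[neppV + i]; return out; }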
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -537,7 +484,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -570,6 +521,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; @@ -611,6 +566,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -731,8 +690,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -740,25 +699,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + 
//printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -903,13 +1040,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -921,18 +1052,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -957,93 +1093,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1085,7 +1158,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1108,7 +1181,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1117,21 +1190,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1145,8 +1220,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1162,11 +1239,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1268,14 +1346,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 159826a904..9339b0e34c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 1; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc new file mode 100644 index 0000000000..44aadd6b60 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc @@ -0,0 +1,425 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
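Before the body of the new color_sum.cc, a pseudocode recap (not part of the patch itself) of how the pieces introduced in the hunks above fit together in the GPU sigmaKin path:

  // for each good helicity ighel, in its own CUDA/HIP stream:
  //   calculate_jamps<<<gpublocks,gputhreads,0,stream[ighel]>>>( ... )  // fill ghelAllJamps
  //     (in multichannel mode also the per-helicity numerators/denominators and colAllJamp2s)
  // color_sum_gpu( ... )         // per-helicity |M|^2 via color_sum_kernel or cuBLAS/hipBLAS (color_sum_blas)
  // gpuDeviceSynchronize()       // wait until all helicity streams have completed
  // add_and_select_hel<<<...>>>  // sum |M|^2 over helicities and pick one helicity per event (#403)
  // select_col<<<...>>>          // multichannel only: pick one colour per event from the jamp2 sums (#402)
  // normalise_output<<<...>>>    // divide by spin/colour denominators (and apply numerator/denominator reweighting)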
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1 }; // 1-D array[1] + + // The color matrix (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
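To make the triangular normalisation concrete, a tiny worked example for a hypothetical process with ncolor=2 (the numbers are invented for illustration; for this e+ e- -> mu+ mu- process ncolor=1 and the matrix is simply {{1}}):

  // Hypothetical ncolor=2 inputs (NOT this process):
  //   colorDenom = { 3, 3 },  colorMatrix = { { 16, -2 }, { -2, 16 } }
  // The constexpr TriangularNormalizedColorMatrix would then hold
  //   value[0][0] = 16./3,  value[0][1] = 2*(-2.)/3 = -4./3,  value[1][0] = 0,  value[1][1] = 16./3
  // and the icol/jcol loop below would compute, per event,
  //   deltaME = value[0][0]*(A0*A0 + B0*B0) + value[0][1]*(A0*A1 + B0*B1) + value[1][1]*(A1*A1 + B1*B1)
  // with A = Re(jamp), B = Im(jamp), which equals the full symmetric quadratic form A^T M A + B^T M B.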
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fbridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/makefile_original.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], 
buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
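For reference, for each helicity ihel and event ievt the BLAS color sum above (Step 1 GEMM, then Step 2 batched dot products) accumulates ME += Re(J)^T C Re(J) + Im(J)^T C Im(J), where C is the (real) normalized color matrix and J is the vector of ncolor jamps stored with the striding described in DeviceAccessJamp. A minimal single-event C++ sketch of the same reduction follows; the helper and argument names are illustrative and not taken from this patch.
// Illustrative reference implementation (hypothetical helper, not part of this patch):
// single event, single helicity, row-major ncolor x ncolor (real) color matrix.
inline double colorSumRef( int ncolor, const double* colMat, const double* jampRe, const double* jampIm )
{
  double me = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempRe = 0, ztempIm = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempRe += colMat[icol * ncolor + jcol] * jampRe[jcol]; // Step 1: Ztemp = ColorMatrix * Jamps (real part)
      ztempIm += colMat[icol * ncolor + jcol] * jampIm[jcol]; // Step 1: Ztemp = ColorMatrix * Jamps (imag part)
    }
    me += jampRe[icol] * ztempRe + jampIm[icol] * ztempIm; // Step 2: ME += Jamps dot Ztemp (real plus imag contributions)
  }
  return me;
}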
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
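For HASBLAS=hasNoBlas builds, the -DMGONGPU_HAS_NO_BLAS flag added to CXXFLAGS and GPUFLAGS above is what the C++/CUDA sources key on; a minimal sketch of the compile-time guard follows, with an illustrative constant name not taken from this patch.
// Illustrative guard on the flag set by this makefile (HASBLAS=hasNoBlas adds -DMGONGPU_HAS_NO_BLAS)
#include "mgOnGpuConfig.h"
#ifdef MGONGPU_HAS_NO_BLAS
constexpr bool useBlasColorSum = false; // color_sum_gpu must then receive a null BLAS handle (kernel path)
#else
constexpr bool useBlasColorSum = true; // a cuBLAS/hipBLAS handle may be created and passed (BLAS path)
#endif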
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LINKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below.
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index 18f664e0d1..4dd98afc5d 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index 37676c1d8d..dd3280eb5d 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 5fcde71f6b..0c43310313 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported (they are used e.g. for the BLAS-based color sum on GPU) +// For both CUDA and HIP, BLAS is enabled by default, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 453da8d298..f28f5709d8 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0065233707427978516  +DEBUG: model prefixing takes 0.004693746566772461  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,21 +150,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.009 s +1 processes with 3 diagrams generated in 0.007 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -177,53 +176,48 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.072 s +Wrote files for 10 helas calls in 0.064 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.115 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.131 s VVV1 FFV1 FFV1 FFV1 -FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. 
quit -real 0m1.991s -user 0m1.616s -sys 0m0.275s +real 0m2.156s +user 0m1.749s +sys 0m0.401s Code generation completed in 2 seconds ************************************************************ * * @@ -237,7 +231,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -245,10 +239,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -267,7 +260,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -275,10 +268,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. 
@@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat index 66598786f5..404258ce86 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/Source/.make_opts b/epochX/cudacpp/gg_tt.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_tt.mad/Source/cuts.inc b/epochX/cudacpp/gg_tt.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_tt.mad/Source/make_opts b/epochX/cudacpp/gg_tt.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_tt.mad/Source/makefile b/epochX/cudacpp/gg_tt.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/makefile +++ b/epochX/cudacpp/gg_tt.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_tt.mad/Source/run_card.inc b/epochX/cudacpp/gg_tt.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
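For reference, the grid-selection logic in the Bridge constructor shown above can be written as a standalone sketch (illustrative only, not the plugin's code): starting from 256 GPU threads per block, the thread count is halved until nevt is an exact multiple of gpublocks*gputhreads, throwing if it would drop below the minimum of 32.

  #include <stdexcept>

  // Illustrative sketch of the Bridge grid-selection loop above (ad hoc name).
  // The constructor has already checked that nevt is a non-zero multiple of the
  // 32-thread minimum, which guarantees that this loop terminates.
  inline void chooseGpuGrid( unsigned int nevt, int& gpublocks, int& gputhreads, const int gputhreadsmin = 32 )
  {
    gputhreads = 256;              // default number of gpu threads
    gpublocks = nevt / gputhreads; // may still leave nevt != gpublocks*gputhreads
    while( nevt != static_cast<unsigned int>( gpublocks * gputhreads ) )
    {
      gputhreads /= 2;
      if( gputhreads < gputhreadsmin )
        throw std::logic_error( "chooseGpuGrid: cannot choose gputhreads" );
      gpublocks = nevt / gputhreads;
    }
  }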
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
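The AOS-to-AOSOA index arithmetic used by the transposition code above can be checked with a tiny standalone program (illustrative only; np4=4 is the usual four-momentum dimension, while npar=4 and neppM=4 are assumed example values, not necessarily those of this build):

  #include <cstdio>

  // Map one (event, particle, component) element from the Fortran-style AOS
  // layout to the C++-style AOSOA layout, using the same arithmetic as above.
  int main()
  {
    const int npar = 4, np4 = 4, neppM = 4; // assumed example dimensions
    const int ievt = 5, ipar = 2, ip4 = 1;  // arbitrary element to map
    const int ipagM = ievt / neppM, ieppM = ievt % neppM;
    const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                  // AOS position
    const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // AOSOA position
    std::printf( "fpos=%d maps to cpos=%d\n", fpos, cpos );                                 // prints fpos=89 maps to cpos=101
    return 0;
  }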
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
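The gpuBlas* portability macros introduced in GpuAbstraction.h above, together with the checkGpuBlas helper added to GpuRuntime.h, can be exercised as in the following sketch (illustrative only, not code from this patch; it assumes a CUDA or HIP build with BLAS enabled, i.e. MGONGPU_HAS_NO_BLAS undefined, and only shows the call shapes, not a meaningful computation):

  // Ad hoc example function: double-precision GEMM through the gpuBlas* macros,
  // which expand to cublas* calls on CUDA and hipblas* calls on HIP.
  void exampleGpuBlasUse()
  {
    const int m = 2, n = 2, k = 2;
    double *dA, *dB, *dC;
    gpuMalloc( &dA, m * k * sizeof( double ) ); // device buffers (left uninitialized here)
    gpuMalloc( &dB, k * n * sizeof( double ) );
    gpuMalloc( &dC, m * n * sizeof( double ) );
    gpuBlasHandle_t handle;
    checkGpuBlas( gpuBlasCreate( &handle ) );
    const double alpha = 1.0, beta = 0.0;       // C = alpha*A*B + beta*C (column-major)
    checkGpuBlas( gpuBlasDgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N, m, n, k, &alpha, dA, m, dB, k, &beta, dC, m ) );
    checkGpuBlas( gpuBlasDestroy( handle ) );
    gpuFree( dA );
    gpuFree( dB );
    gpuFree( dC );
  }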
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
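The runtime switch for BLAS color sums in MatrixElementKernels.cc above is driven purely by whether the CUDACPP_RUNTIME_BLASCOLORSUM environment variable is set and non-empty. A minimal sketch of that once-per-process pattern (illustrative only, not the plugin's code; the helper name is ad hoc):

  #include <cstdlib>
  #include <iostream>
  #include <string>

  // Ad hoc helper mirroring the env-var check above: the decision is taken once
  // and cached for the lifetime of the process.
  inline bool useBlasColorSum()
  {
    static const bool enabled = []() {
      const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
      const bool on = ( env != nullptr && std::string( env ) != "" );
      std::cout << ( on ? "INFO: BLAS color sums enabled at runtime" : "INFO: BLAS color sums disabled at runtime" ) << std::endl;
      return on;
    }();
    return enabled;
  }

A user would therefore export the variable (for example CUDACPP_RUNTIME_BLASCOLORSUM=1) before launching the executable to enable the BLAS path, provided the build did not define MGONGPU_HAS_NO_BLAS.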
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace 
mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index b32f4b931e..9a72b09e5a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
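The DeviceAccessJamp2 accessor introduced above flattens the per-colour running sums as buffer[icol * nevt + ievt], i.e. colour-major with the event index innermost, so that consecutive GPU threads (consecutive ievt) read and write contiguous memory. The following is a minimal host-side sketch of that indexing convention only; the flatIndex helper and the toy sizes are illustrative and not part of the patch.

#include <cassert>
#include <cstdio>

// Illustrative only: the same flattening as DeviceAccessJamp2::kernelAccessIcol,
// written as a host function so the layout can be checked in isolation.
inline int flatIndex( const int icol, const int ievt, const int nevt )
{
  return icol * nevt + ievt; // colour-major, event index innermost
}

int main()
{
  const int ncolor = 2; // as in this P1_gg_ttx process
  const int nevt = 8;   // toy value (in the kernel this is gridDim.x * blockDim.x)
  // Consecutive events of the same colour are adjacent in memory (coalesced access)
  assert( flatIndex( 1, 4, nevt ) == flatIndex( 1, 3, nevt ) + 1 );
  for( int icol = 0; icol < ncolor; icol++ )
    printf( "icol=%d occupies flat elements [%d,%d)\n", icol, flatIndex( icol, 0, nevt ), flatIndex( icol, nevt, nevt ) );
  return 0;
}

The per-helicity jamp super-buffer used by sigmaKin relies on the same event-innermost convention (see DeviceAccessJamp and the ghelAllJamps + ighel * nevt offset further below).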
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -368,154 +424,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?) 
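On the C++ side just above, the per-colour squared amplitudes used for the event-by-event colour choice are accumulated as jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ), i.e. one block of ncolor running sums per parity page. Below is a minimal scalar sketch of that bookkeeping, with std::complex<double> standing in for the SIMD complex type and std::norm playing the role of cxabs2; the names are illustrative and not part of the patch.

#include <array>
#include <complex>

// Illustrative only: accumulate |jamp(icol)|^2 per colour and per parity page,
// mirroring the jamp2_sv[ncolor * iParity + icol] indexing used above.
template<int nParity, int ncolor>
void accumulateJamp2( std::array<double, nParity * ncolor>& jamp2,
                      const std::array<std::complex<double>, ncolor>& jamp,
                      const int iParity )
{
  for( int icol = 0; icol < ncolor; icol++ )
    jamp2[ncolor * iParity + icol] += std::norm( jamp[icol] ); // re^2 + im^2, as cxabs2 does
}

On the GPU, the equivalent sums go into the colAllJamp2s super-buffer via atomicAdd (see below), because one CUDA stream per helicity may update the same slot concurrently.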
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
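The comments removed here (and reintroduced in the new color_sum.cc below) describe the key optimisation of the colour sum: because the colour matrix is real and symmetric, the quadratic form jamp^dagger (cf/denom) jamp reduces to real arithmetic on Re(jamp) and Im(jamp), and only the upper triangle needs to be visited once the off-diagonal entries are pre-doubled. A minimal scalar sketch for this ncolor=2 process, using the denom = {3,3} and cf = {{16,-2},{-2,16}} constants quoted above (no SIMD, no mixed precision; the function name is illustrative):

#include <complex>

// Illustrative only: scalar colour sum |M|^2 = sum_ij jamp_i^* ( cf[i][j] / denom[i] ) jamp_j
// for ncolor=2, exploiting that cf is real and symmetric (see #475).
double colorSumScalar( const std::complex<double> jamp[2] )
{
  constexpr double denom[2] = { 3, 3 };
  constexpr double cf[2][2] = { { 16, -2 }, { -2, 16 } };
  double me2 = 0;
  for( int icol = 0; icol < 2; icol++ )
  {
    // Diagonal term, then pre-doubled off-diagonal terms of the upper triangle
    double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real();
    double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < 2; jcol++ )
    {
      ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
      ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
    }
    me2 += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI; // AMA + BMB, no imaginary cross terms
  }
  return me2;
}

In the patch itself the C++/SIMD path keeps this triangular form (TriangularNormalizedColorMatrix in color_sum_cpu), while the GPU path uses the full normalised colour matrix in device memory, optionally through cuBLAS/hipBLAS (color_sum_gpu).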
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -555,7 +500,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +537,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +581,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -748,8 +705,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +714,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -920,13 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,18 +1067,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -974,93 +1108,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1173,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1196,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,21 +1205,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1162,8 +1235,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1254,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1361,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index feff1cc6e1..5d952c7419 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index bc9bcfeb9b..008afc92ae 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index db3c284caa..fc3ede89c4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
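The Fortran rework above replaces the flat event loop in the DSIG1_VEC finalisation with an outer loop over warps, so that the mirror-dependent beam ordering IB(1)/IB(2) and the per-warp ICONFIG can be set before each event is unweighted; the flat event index is then recovered as IVEC = (CURR_WARP-1)*WARP_SIZE + IWARP. A trivial C++ sketch of that index arithmetic, written 0-based whereas the Fortran code is 1-based (the names and toy sizes are illustrative):

#include <cassert>

// Illustrative only: the warp/lane to flat-event mapping used by the reworked
// DSIG1_VEC loop above, written 0-based (the Fortran code is 1-based).
inline int flatEvent( const int iwarp, const int ilane, const int warpSize )
{
  return iwarp * warpSize + ilane;
}

int main()
{
  const int warpSize = 32; // toy value; the actual WARP_SIZE is defined elsewhere in the Fortran code
  const int nbWarps = 4;   // toy value standing in for NB_WARP_USED
  int expected = 0;
  for( int iwarp = 0; iwarp < nbWarps; iwarp++ )
    for( int ilane = 0; ilane < warpSize; ilane++ )
    {
      assert( flatEvent( iwarp, ilane, warpSize ) == expected ); // every event visited exactly once, warp by warp
      ++expected;
    }
  return 0;
}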
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 
) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& 
MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! 
From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need 
one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc index 99d3eecc56..0dbac30825 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc @@ -24,3 +24,5 @@ C Diagram 3 
DATA (SPROP(I,-2,3),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/3/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fbridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/makefile_original.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index 707ea40323..8481c73d0f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -227,17 +224,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -307,7 +293,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +336,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +380,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +436,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +450,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. 
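The matrix1.f hunk above replaces the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix with an integer array holding only the upper triangle (off-diagonal entries pre-doubled) plus a common DENOM, so the inner loop runs over J >= I and a single division by DENOM is applied at the end. This relies on the color matrix being real and symmetric: for jamp = A + iB the quadratic form conj(jamp)·CF·jamp reduces to AᵀCF·A + BᵀCF·B, so only the real part of the triangular sum is needed. The C++ sketch below, with hypothetical names (packedCF, denom, jamp), spells out the same bookkeeping for readability; it is an illustration, not code from this patch.

// Minimal C++ sketch (hypothetical names) of the packed upper-triangular color
// sum used in the new MATRIX1 loop: only J >= I entries are stored, off-diagonal
// entries are pre-doubled, and one division by denom is done at the end.
#include <complex>
#include <vector>

double colorSumPackedUpper( const std::vector<int>& packedCF,                    // size ncolor*(ncolor+1)/2
                            int denom,                                           // common denominator (e.g. 3)
                            const std::vector<std::complex<double>>& jamp )      // size ncolor
{
  const int ncolor = (int)jamp.size();
  double me = 0.;
  int idx = 0; // walks the packed rows exactly like CF_INDEX in matrix1.f
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0.;
    for( int j = i; j < ncolor; j++ ) ztemp += double( packedCF[idx++] ) * jamp[j];
    me += std::real( ztemp * std::conj( jamp[i] ) ); // keep only the real part, as in the Fortran REAL*8 accumulation
  }
  return me / denom;
}

For the gg_ttx color data above (packedCF = {16,-4,16}, denom = 3) this reproduces the same |M|^2 contribution as the old dense 5.333/-0.667 matrix while touching only the upper triangle once.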
+ maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + 
icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. @@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
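The color_sum_blas routine above maps the per-event color sum onto two BLAS steps: a gemm computing Ztemp = normalizedColorMatrix x Jamps (done separately for the real and the imaginary parts), followed by a strided-batched gemm that evaluates the per-event dot product Jamps(:,ievt) . Ztemp(:,ievt) and accumulates it into the MEs buffer (beta = 1). The CPU reference below is a minimal sketch with hypothetical names, shown only to make the two steps and the real/imaginary split explicit; it ignores the helicity dimension, the streams and the mixed-precision conversions handled in the actual code.

#include <vector>

// Step 1: ztemp[icol][ievt]  = sum_jcol normColMat[icol][jcol] * jampPart[jcol][ievt]
// Step 2: me[ievt]          += sum_icol jampPart[icol][ievt]   * ztemp[icol][ievt]
// Calling this once with the real parts and once with the imaginary parts gives,
// for a real symmetric color matrix, the same |M|^2 contribution per event as
// the triangular CPU/kernel implementations.
void colorSumReference( std::vector<double>& me,                // [nevt], accumulated in place (beta = 1)
                        const std::vector<double>& normColMat,  // [ncolor*ncolor], already divided by the denominators
                        const std::vector<double>& jampPart,    // [ncolor*nevt], real OR imaginary parts
                        int ncolor,
                        int nevt )
{
  std::vector<double> ztemp( ncolor * nevt, 0. );
  for( int icol = 0; icol < ncolor; icol++ )       // step 1: matrix-matrix product (the gemm above)
    for( int jcol = 0; jcol < ncolor; jcol++ )
      for( int ievt = 0; ievt < nevt; ievt++ )
        ztemp[icol * nevt + ievt] += normColMat[icol * ncolor + jcol] * jampPart[jcol * nevt + ievt];
  for( int ievt = 0; ievt < nevt; ievt++ )         // step 2: batched per-event dot products (the strided-batched gemm above)
    for( int icol = 0; icol < ncolor; icol++ )
      me[ievt] += jampPart[icol * nevt + ievt] * ztemp[icol * nevt + ievt];
}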
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef 
MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
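The DeviceAccessJamp accessors above encode the "all helicities" cuBLAS-friendly layout: the jamp super-buffer is laid out as two contiguous [ncolor][nhel][nevt] blocks, first all real parts and then all imaginary parts, with the event index running fastest. The helper below is a minimal sketch (hypothetical name flatJampIndex) that just spells out the flat offset used in kernelAccessIcolIhelNhel, so the leading dimensions and strides passed to the BLAS calls are easier to follow.

// Flat offset into the jamp super-buffer of size 2*ncolor*nhel*nevt:
// part = 0 (real) or 1 (imaginary), then icol, then ihel, then ievt (fastest).
inline int flatJampIndex( int part, int icol, int ihel, int ievt,
                          int ncolor, int nhel, int nevt )
{
  return part * ncolor * nhel * nevt // real block first, imaginary block second
         + icol * nhel * nevt        // one [nhel][nevt] slab per color
         + ihel * nevt               // one [nevt] row per helicity
         + ievt;                     // event index runs fastest
}

// Example: the real part of jamp(icol,ihel,ievt) lives at
//   buffer[ flatJampIndex( 0, icol, ihel, ievt, ncolor, nhel, nevt ) ]
// and its imaginary part at
//   buffer[ flatJampIndex( 1, icol, ihel, ievt, ncolor, nhel, nevt ) ].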
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
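On top of the HASBLAS build-time detection and the BLASCXXFLAGS/BLASLIBFLAGS plumbing above, color_sum_gpu dispatches at runtime: when no BLAS handle is passed it launches one color_sum_kernel per good helicity on its own stream, otherwise it zeroes the temporary super-buffer and delegates to color_sum_blas. Its sanity checks mention the CUDACPP_RUNTIME_BLASCOLORSUM environment variable and HASBLAS=hasNoBlas as the knobs behind that choice. The snippet below is only a hypothetical illustration of how such a caller-side switch could look; the actual wiring (where the handle, streams and temporary buffer are created) is outside this diff, and the exact environment-variable semantics are an assumption.

#include <cstdlib>
#include <cstring>

// Hypothetical caller-side switch: decide whether color_sum_gpu should receive
// a BLAS handle (BLAS path) or a null pointer (kernel path). Assumes hasBlasBuild
// reflects whether the build defined MGONGPU_HAS_NO_BLAS (HASBLAS=hasNoBlas).
bool useBlasColorSum( bool hasBlasBuild )
{
  if( !hasBlasBuild ) return false;                        // hasNoBlas build: kernel path only
  const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  return env != nullptr && std::strlen( env ) > 0;         // opt-in via environment variable (assumption)
}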
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
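Two of the genps.f changes in the hunk above are worth spelling out. First, the new run_card parameter dsqrt_shatmax (default -1, meaning "no upper cut") complements the existing dsqrt_shat lower bound: it is enforced as an event cut in passcuts (cuts.f) and also caps the generated phase space (smax in gen_s, TAUMAX in GENCMS). Second, the single-diagram enhancement in get_channel_cut now uses a plain Breit-Wigner propagator weight. A minimal Python sketch, for illustration only (names mirror the Fortran variables shown above; this is not MG5aMC code):

    def pass_shat_window(shat, dsqrt_shat=0.0, dsqrt_shatmax=-1.0):
        """passcuts logic for the shat window: dsqrt_shat is the lower bound
        on sqrt(shat), dsqrt_shatmax the upper bound (-1 disables it)."""
        if shat < dsqrt_shat**2:
            return False
        if dsqrt_shatmax != -1.0 and shat > dsqrt_shatmax**2:
            return False
        return True

    def taumax(s, dsqrt_shatmax=-1.0):
        """Upper bound on tau = x1*x2 in GENCMS: shat = tau*S <= dsqrt_shatmax**2."""
        return dsqrt_shatmax**2 / s if dsqrt_shatmax != -1.0 else 1.0

    def bw_channel_factor(p2, mass, width):
        """Per-propagator factor in get_channel_cut after the change above:
        1/((p2 - M**2)**2 + (M*Gamma)**2), replacing the previous
        (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 form."""
        tmp = p2 - mass**2    # off-shellness (t - Mass**2 in the Fortran)
        tmp2 = mass * width   # Mass*Width
        return 1.0 / (tmp**2 + tmp2**2)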
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
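The madevent_*_link convenience targets in the deleted makefile above (and their counterparts in the new overlay earlier in this diff) only rebuild the requested backend and repoint the madevent symlink; the madevent_%_link pattern rule additionally validates the cpp SIMD backend name against SUPPORTED_AVXS. A rough Python equivalent, for illustration only and not part of the patch:

    import os
    import subprocess

    SUPPORTED_AVXS = ["cppnone", "cppsse4", "cppavx2", "cpp512y", "cpp512z", "cppauto"]

    def madevent_link(backend, builddir, prog="madevent"):
        """What 'make madevent_<backend>_link' does for the cpp backends."""
        if backend not in SUPPORTED_AVXS:
            raise ValueError(f"Invalid target madevent_{backend}_link "
                             f"(supported: {SUPPORTED_AVXS})")
        target = os.path.join(builddir, f"{prog}_cpp")
        # make USEGTEST=0 BACKEND=<backend> <builddir>/madevent_cpp
        subprocess.run(["make", "USEGTEST=0", f"BACKEND={backend}", target], check=True)
        if os.path.lexists(prog):
            os.remove(prog)           # rm -f madevent
        os.symlink(target, prog)      # ln -s <builddir>/madevent_cpp madevent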
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk new file mode 100644 
index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
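For orientation: this hunk and the neighbouring ones replace the single cudacpp-modified SubProcesses/makefile with a small stack of files. makefile_original.mk (the new file above) restores the upstream madevent rules, the cudacpp-specific targets move to a separate overlay (presumably the overlay rules shown at the top of this section), and makefile itself becomes a symlink to the wrapper shown just below:

    makefile              -> symlink to makefile_wrapper.mk
    makefile_wrapper.mk   -> sets SHELL and includes the two files below
    makefile_original.mk  -> upstream madevent build rules (this file)
    cudacpp_overlay.mk    -> cudacpp additions (cudacpp executables, link/bld targets)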
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
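The new flavour_bias run_card option validated just above takes a pair [pdg, factor]: per the in-card comment, events containing the given |pdg| are generated factor times more often while their weight is divided by the same factor, which is why check_validity forces event_norm to 'bias' whenever the factor is not 1. A hedged sketch of the weight compensation implied by that convention (illustration only; the actual biasing is done in the generation code, not in this hunk):

    def compensate_weight(weight, event_pdgs, flavour_bias=(5, 1)):
        """Divide the event weight by the enhancement factor when the event
        contains the biased flavour, per the flavour_bias documentation."""
        pdg, factor = flavour_bias
        if factor != 1 and any(abs(p) == pdg for p in event_pdgs):
            return weight / factor
        return weight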
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/files.py b/epochX/cudacpp/gg_tt.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail 
to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
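The gridpack refinement reworked in gen_ximprove.py above now splits a channel into several refine jobs whenever it is asked for more events than the new maxevts cap (2500 by default, applied when nprocs > 1) allows per job. A minimal sketch of that arithmetic, using a hypothetical helper name that is not part of the patch:

def compute_nb_split(needed_event, max_request_event, max_splitting, split_channels=True):
    """Number of refine jobs for one channel: ceil(needed_event / max_request_event),
    forced to 1 when channel splitting is disabled and capped at max_splitting."""
    nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
    if not split_channels:
        nb_split = 1
    return max(1, min(nb_split, max_splitting))

# e.g. 10000 requested events with maxevts=2500 -> 4 jobs of roughly 2500 events each
assert compute_nb_split(10000, 2500, max_splitting=100) == 4

When nprocs > 1 the resulting jobs are handed to a cluster.MultiCore pool instead of being run one by one, and a periodic "Idle/Running/Done" status line is printed so that long gridpack generations no longer look stuck.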
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
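Among the histograms.py fixes above, the gnuplot version probe now keeps the result of decode() (the old code called it and discarded the return value, leaving bytes) and compares only the integer major version instead of calling float() on the whole version token, presumably to stay robust for version strings that are not a plain float. A small sketch of the parsing, with a hypothetical helper name:

def gnuplot_major_version(raw_output):
    # e.g. b'gnuplot 5.4 patchlevel 2' -> 5 (illustrative helper, not in the patch)
    text = raw_output.decode(errors='ignore') if isinstance(raw_output, bytes) else raw_output
    return int(text.split()[1].split('.')[0])

assert gnuplot_major_version(b'gnuplot 5.4 patchlevel 2') == 5
assert gnuplot_major_version('gnuplot 6.0.1') == 6

The companion change in the same file swaps sqrt(-1) for 1/0 in the generated plot commands; gnuplot treats both as an undefined value, so the data points are suppressed while the key entry is still printed.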
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
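Separately, the lhe_parser.py hunk above flips the sign convention of FourMomentum.pseudorapidity, which previously returned minus eta because the numerator and denominator were swapped. A standalone check of the corrected formula (the function name here is illustrative, not the class method):

import math

def pseudorapidity(px, py, pz):
    # eta = 0.5 * ln((|p| + pz) / (|p| - pz)), i.e. atanh(pz / |p|)
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

# a forward-going particle (pz > 0) must have positive eta
assert pseudorapidity(3.0, 4.0, 12.0) > 0
assert abs(pseudorapidity(3.0, 4.0, 12.0) - math.atanh(12.0 / 13.0)) < 1e-12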
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
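The do_pythia8 rework above makes the shower default to Pythia8's bundled main164 driver (invoked with -c and the generated command file) and only uses the external MG5aMC_PY8_interface when --old_interface is passed or main164 cannot be found. A hedged sketch of the executable lookup order, as a hypothetical standalone helper:

import os

def find_pythia8_main164(pythia8_path):
    """Return the path of Pythia8's main164 driver, or None if it is not built.
    Mirrors the lookup in do_pythia8: share/Pythia8/examples first, then examples."""
    for candidate in (os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
                      os.path.join(pythia8_path, 'examples', 'main164')):
        if os.path.exists(candidate):
            return candidate
    return None

When neither location exists, the patch falls back to the old interface by re-invoking do_pythia8 with --old_interface appended.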
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
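The parallel Pythia8 path above writes one PY8 card per split, pinning Main:numberOfEvents to that split's share of events and rescaling HEPMCoutput:scaling by the same number. The split(a, n) generator that appears as context earlier in madevent_interface.py produces exactly this kind of near-even partition; it is reproduced here with a usage check:

def split(a, n):
    # partition the sequence a into n chunks whose lengths differ by at most one
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

assert [len(chunk) for chunk in split(list(range(10)), 3)] == [4, 3, 3]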
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
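The new remove_empty_events helper above drops Gdirectories whose events.lhe is essentially empty and then scans each channel log to classify why (no points passing cuts, impossible Breit-Wigner configuration, all amplitudes zero, or unknown), warning only about the genuinely critical cases. A hedged sketch of just the size test (the helper name is illustrative, not part of the patch):

import os

def has_usable_events(gdir, min_bytes=10):
    # mirrors the 'size < 10' cut used in remove_empty_events
    try:
        return os.path.getsize(os.path.join(gdir, 'events.lhe')) >= min_bytes
    except OSError:
        return False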
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data b/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_tt.mad/bin/madevent b/epochX/cudacpp/gg_tt.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/madevent +++ b/epochX/cudacpp/gg_tt.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index febf1dcf42..0561db9dc0 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index d09f387480..4772e6dc1d 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index ba434e7b98..41fb70a23e 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 816b17272d..021fefaea7 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006091594696044922  +DEBUG: model prefixing takes 0.004730224609375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,45 +150,45 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.007 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.123 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.544s -user 0m0.472s -sys 0m0.060s +real 0m0.525s +user 0m0.459s +sys 0m0.062s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/COPYRIGHT b/epochX/cudacpp/gg_tt.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_tt.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. 
- * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
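The grid-selection loop in the Bridge constructor above can be read in isolation: starting from 256 GPU threads per block, it halves the block size until the event count factorizes exactly as gpublocks*gputhreads, and refuses to go below the s_gputhreadsmin floor. The following is a minimal standalone sketch of that logic; the function name chooseGpuGrid and the plain int pair it returns are illustrative only, not part of the plugin API.

#include <stdexcept>
#include <string>
#include <utility>

// Sketch of the Bridge constructor's grid selection: pick (gpublocks, gputhreads)
// with gpublocks * gputhreads == nevt, starting from 256 threads per block.
std::pair<int, int> chooseGpuGrid( int nevt, int gputhreadsmin = 32 )
{
  if( nevt < gputhreadsmin || nevt % gputhreadsmin != 0 )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
  int gputhreads = 256;              // default number of gpu threads
  int gpublocks = nevt / gputhreads; // integer division, may not factorize nevt exactly yet
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2; // halve the block size and retry
    if( gputhreads < gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" ); // unreachable given the checks above
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads };
}

For example, nevt=8192 keeps the default and yields (32, 256), while nevt=96 falls through to (3, 32).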
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
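The param_card lookup in the Bridge constructor above deliberately avoids std::filesystem (issue #803): it calls stat() through a small lambda and climbs at most two directories up from the new default of "../Cards/param_card.dat". A self-contained sketch of that search follows; findParamCard is an illustrative name rather than a plugin function, and the POSIX stat() call mirrors what the constructor itself uses.

#include <cstddef>
#include <string>
#include <sys/stat.h> // stat(), used to bypass std::filesystem (#803)

// Sketch of the Bridge constructor's param_card search: start from a relative
// default and prepend "../" up to maxLevelsUp times until the file is found.
inline std::string findParamCard( std::string paramCard = "../Cards/param_card.dat",
                                  std::size_t maxLevelsUp = 2 )
{
  auto fileExists = []( const std::string& fileName )
  { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; };
  for( std::size_t k = 0; k < maxLevelsUp; ++k )
  {
    if( fileExists( paramCard ) ) break; // found it: stop climbing
    paramCard = "../" + paramCard;       // otherwise look one directory further up
  }
  return paramCard; // may still not exist: initProc reports the error downstream
}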
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
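The transposition code in this hunk reduces to two index formulas for the same (event, particle, component) triple: fpos in the Fortran-ordered AOS array and cpos in the cudacpp AOSOA array. Below is a host-loop sketch of the F2C direction only, with npar, np4 and neppM passed as plain arguments instead of the CPPProcess and MemoryAccessMomenta constants used in the real code; the function name is illustrative.

#include <cassert>

// Sketch of the AOS (Fortran) -> AOSOA (cudacpp) momenta copy:
//   in : AOS   [nevt][npar][np4]          (P_MULTI as seen from C)
//   out: AOSOA [npagM][npar][np4][neppM]  with nevt = npagM * neppM
template<typename T>
void transposeMomentaF2C( T* out, const T* in, int nevt, int npar, int np4, int neppM )
{
  assert( nevt % neppM == 0 ); // the number of events must be a multiple of neppM
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    const int ipagM = ievt / neppM; // "page" of neppM events
    const int ieppM = ievt % neppM; // event within the page
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4;
        out[cpos] = in[fpos]; // F2C (Fortran to C)
      }
  }
}

With neppM=1 the two layouts coincide and the loops degenerate into an element-wise copy, which is why the comments in this hunk note that a memcpy is enough in that case.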
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
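The checkGpuBlas/assertGpuBlas pair added to GpuRuntime.h above mirrors the existing checkGpu pattern: each cuBLAS/hipBLAS call is wrapped so that a status other than GPUBLAS_STATUS_SUCCESS is printed with file and line and then asserted on. A minimal usage sketch against the gpuBlas* and gpuStream* abstraction macros follows; it assumes a GPU build with BLAS enabled (MGONGPU_HAS_NO_BLAS not defined), and demoCreateBlasHandle is an illustrative helper, not part of the plugin.

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // checkGpu, checkGpuBlas

// Sketch: create a BLAS handle and attach it to a freshly created stream,
// checking every returned status along the way.
gpuBlasHandle_t demoCreateBlasHandle( gpuStream_t& stream )
{
  gpuBlasHandle_t handle;
  gpuStreamCreate( &stream );                         // checkGpu is already baked into this macro
  checkGpuBlas( gpuBlasCreate( &handle ) );           // assert if the status is not GPUBLAS_STATUS_SUCCESS
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // route subsequent BLAS calls to this stream
  return handle;
}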
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
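The runtime switch analysed in the MatrixElementKernelDevice constructor above boils down to a single getenv check: CUDACPP_RUNTIME_BLASCOLORSUM enables BLAS color sums only when it is set and non-empty, and only if the build did not define MGONGPU_HAS_NO_BLAS (the TF32 variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is layered on top in the same way). A condensed sketch of that decision follows; envIsSet and decideBlasColorSum are illustrative helpers, not plugin functions.

#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

// A variable counts as "set" only if it exists and is non-empty.
inline bool envIsSet( const char* name )
{
  const char* value = std::getenv( name );
  return value && std::string( value ) != "";
}

// Sketch of the one-time decision on BLAS color sums.
inline bool decideBlasColorSum()
{
  if( !envIsSet( "CUDACPP_RUNTIME_BLASCOLORSUM" ) ) return false; // default: BLAS disabled
#ifdef MGONGPU_HAS_NO_BLAS
  throw std::runtime_error( "CUDACPP_RUNTIME_BLASCOLORSUM is set, but BLAS was disabled at build time" );
#else
  std::cout << "INFO: CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl;
  return true;
#endif
}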
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace 
mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 5c7a133eed..fe42002366 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
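The DeviceAccessJamp2 helper introduced a few hunks above addresses element (icol, ievt) of a colour-major device buffer as buffer[icol * nevt + ievt], with nevt and ievt derived from the launch grid. The standalone CUDA sketch below illustrates only that indexing convention; the kernel and buffer names are hypothetical.

#include <cuda_runtime.h>

__global__ void writeColorMajor( double* buffer, int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;                // same convention as DeviceAccessJamp2
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    buffer[icol * nevt + ievt] = icol;                    // element (icol, ievt) of the [ncolor][nevt] buffer
}

int main()
{
  const int gpublocks = 2, gputhreads = 32, ncolor = 2;
  const int nevt = gpublocks * gputhreads;
  double* d_buf = nullptr;
  cudaMalloc( &d_buf, ncolor * nevt * sizeof( double ) );
  cudaMemset( d_buf, 0, ncolor * nevt * sizeof( double ) );
  writeColorMajor<<<gpublocks, gputhreads>>>( d_buf, ncolor );
  cudaDeviceSynchronize();
  cudaFree( d_buf );
  return 0;
}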
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -365,154 +421,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?) 
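The hunks above and below remove the hard-coded ncolor=2 colour algebra from the per-helicity routine (it reappears in the new color_sum.cc file further down in this diff). For reference, a minimal standalone C++ sketch of the quadratic form being computed for one event, using the same colour matrix {{16,-2},{-2,16}} and denominators {3,3} as in the removed code, and using the fact that the matrix is real so only the Re*M*Re and Im*M*Im terms contribute; the example jamp values are made up.

#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;
  constexpr double denom[ncolor] = { 3, 3 };
  constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  const std::complex<double> jamp[ncolor] = { { 1.0, 0.5 }, { -0.25, 2.0 } }; // example partial amplitudes
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real();
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    deltaME += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  std::printf( "|M|^2 contribution for this helicity: %f\n", deltaME );
  return 0;
}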
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
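As the comment in the hunk above notes, once each good helicity is processed in its own stream, several kernels may add |jamp|^2 into the same (icol, ievt) slot of the shared jamp2 buffer concurrently, so the update must be atomic. A minimal standalone CUDA sketch of that pattern follows; the kernel name and values are hypothetical, and atomicAdd on double requires compute capability 6.0 or later.

#include <cuda_runtime.h>

__global__ void addAbs2( double* jamp2, double re, double im )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  atomicAdd( &jamp2[ievt], re * re + im * im ); // safe even if another helicity stream updates the same slot
}

int main()
{
  const int gpublocks = 2, gputhreads = 32, nGoodHel = 2;
  const int nevt = gpublocks * gputhreads;
  double* d_jamp2 = nullptr;
  cudaMalloc( &d_jamp2, nevt * sizeof( double ) );
  cudaMemset( d_jamp2, 0, nevt * sizeof( double ) );
  cudaStream_t streams[2];
  for( int i = 0; i < nGoodHel; i++ ) cudaStreamCreate( &streams[i] );
  for( int i = 0; i < nGoodHel; i++ ) // one launch per good helicity, each on its own stream
    addAbs2<<<gpublocks, gputhreads, 0, streams[i]>>>( d_jamp2, 1.0 + i, 0.5 * i );
  cudaDeviceSynchronize();
  for( int i = 0; i < nGoodHel; i++ ) cudaStreamDestroy( streams[i] );
  cudaFree( d_jamp2 );
  return 0;
}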
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -552,7 +497,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -585,6 +534,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -625,6 +578,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -745,8 +702,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -754,25 +711,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -917,13 +1052,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -935,18 +1064,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -971,93 +1105,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1099,7 +1170,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1122,7 +1193,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1131,21 +1202,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1159,8 +1232,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1176,11 +1251,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1282,14 +1358,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index feff1cc6e1..5d952c7419 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
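+// This file collects the color-sum implementations for this process: color_sum_cpu (C++/SIMD),
+// color_sum_kernel (CUDA/HIP kernel) and color_sum_blas (cuBLAS/hipBLAS GEMMs over all good
+// helicities at once), with color_sum_gpu dispatching between the kernel and BLAS paths at runtime,
+// and createNormalizedColorMatrix copying the normalized color matrix into device memory.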
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
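+    // For this process (ncolor=2, colorMatrix={{16,-2},{-2,16}}, colorDenom={3,3}) the precomputed
+    // triangular matrix is cf2.value = { { 16/3, -4/3 }, { 0, 16/3 } }: only the upper triangle is
+    // filled and the off-diagonal entry already includes the factor 2, so the loop below computes
+    // deltaMEs = 16/3 * ( |jamp0|^2 + |jamp1|^2 ) - 4/3 * ( Re(jamp0)*Re(jamp1) + Im(jamp0)*Im(jamp1) )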
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/cudacpp_overlay.mk 
@@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fbridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/makefile_original.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices 
for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
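+# NB: the color sum is built as a separate object (color_sum_cpp.o and color_sum_$(GPUSUFFIX).o, see
+# cxx_objects_lib and gpu_objects_lib below) and GPU builds link cuBLAS/hipBLAS through BLASLIBFLAGS
+# unless HASBLAS=hasNoBlas (see the HASBLAS section below).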
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
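+# DSIG collects driver.o and the auto_dsig*.o objects (except auto_dsig.o, which is linked separately);
+# DSIG_cudacpp is the same list with the _cudacpp suffix, built with -DMG5AMC_MEEXPORTER_CUDACPP
+# (see the %_cudacpp.o pattern rule below).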
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
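+// extern "C" declarations of the Fortran-callable entry points to the cudacpp Bridge: create/delete
+// a Bridge instance, run the matrix-element sequence with or without multichannel, and query the
+// number of good (and total) helicities.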
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index febf1dcf42..0561db9dc0 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index d09f387480..4772e6dc1d 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index ba434e7b98..41fb70a23e 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
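+# The googletest sources are cloned at tag v1.17.0; on non-macOS hosts the cmake build adds
+# -mavx2 -mfma through GTEST_CMAKE_FLAGS (see below).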
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 6466d14e6d..74af92edcf 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00611424446105957  +DEBUG: model prefixing takes 0.005340576171875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.010 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -159,21 +158,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.019 s +1 processes with 16 diagrams generated in 0.024 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 INFO: Processing color information for process: g g > t t~ g @2 @@ -187,9 +186,9 @@ FileWriter t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -198,25 +197,25 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s -Wrote files for 46 helas calls in 0.184 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  +Generated helas calls for 2 subprocesses (19 diagrams) in 0.050 s +Wrote files for 46 helas calls in 0.272 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.322 s +ALOHA: aloha creates 5 routines in 0.316 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.311 s +ALOHA: aloha creates 10 routines in 0.352 s VVV1 VVV1 FFV1 @@ -226,41 +225,32 @@ ALOHA: aloha creates 10 routines in 0.311 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P2_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 243 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m2.583s -user 0m2.278s -sys 0m0.302s -Code generation completed in 3 seconds +real 0m3.896s +user 0m3.101s +sys 0m0.655s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * @@ -273,7 +263,7 @@ Code generation completed in 3 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -281,10 +271,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -303,7 +292,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -311,10 +300,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. 
The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index e50becb2d9..8728eabc9c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat index 1711d30371..d4c7c73e61 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat index 364dbd21b0..730a05e322 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc b/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/makefile b/epochX/cudacpp/gg_tt01g.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/makefile +++ b/epochX/cudacpp/gg_tt01g.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc b/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc index 2588190439..e169c1f193 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
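To make the Fortran-AOS to cudacpp-AOSOA index arithmetic used in the transposition code above explicit, here is a small standalone sketch; the sizes npar, np4 and neppM are illustrative stand-ins for the real values taken from CPPProcess and MemoryAccessMomenta:

// Standalone illustration of the AOS[nevt][npar][np4] vs AOSOA[npagM][npar][np4][neppM] index mapping.
#include <cstdio>
int main()
{
  constexpr int npar = 4, np4 = 4, neppM = 4; // illustrative sizes only
  const int ievt = 9, ipar = 2, ip4 = 1;      // one momentum component of one particle of one event
  const int ipagM = ievt / neppM;             // "page" of neppM consecutive events
  const int ieppM = ievt % neppM;             // position of the event inside its page
  const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                  // Fortran-style AOS offset
  const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // cudacpp AOSOA offset
  printf( "fpos=%d cpos=%d\n", fpos, cpos );  // F2C copies out[cpos] = in[fpos]; C2F swaps the two sides
  return 0;
}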
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
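The checkGpuBlas guard added to GpuRuntime.h above follows the same pattern as checkGpu, asserting on any non-SUCCESS gpuBlasStatus_t. A minimal sketch of wrapping a BLAS call with it (not part of this patch; the function and pointer names are invented, and the gpuBlas aliases are those defined in GpuAbstraction.h):

#include "GpuRuntime.h" // checkGpuBlas and the gpuBlas* aliases from GpuAbstraction.h

#ifndef MGONGPU_HAS_NO_BLAS
// y[i] += alpha * x[i] on device buffers, with every BLAS status checked
void deviceAxpy( const double* devX, double* devY, const int n )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );                             // cublasCreate or hipblasCreate
  const double alpha = 1.;
  checkGpuBlas( gpuBlasDaxpy( handle, n, &alpha, devX, 1, devY, 1 ) );  // cublasDaxpy or hipblasDaxpy
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif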
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
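The per-helicity BLAS temporaries allocated in MatrixElementKernels.cc above are consumed by color_sum_blas. The sketch below only illustrates how a color-matrix product can be batched over events with the gpuBlasDgemmStridedBatched alias; it is not the actual color_sum.h implementation, and the buffer layout and names are assumptions:

#include "GpuRuntime.h" // gpuBlas aliases and checkGpuBlas

#ifndef MGONGPU_HAS_NO_BLAS
// Sketch: tmp_e = colorMatrix * jamp_e for all events e in a single strided-batched GEMM.
// devColorMatrix: ncolor*ncolor, column-major, shared by all events (hence stride 0)
// devJamps:       ncolor*nevt, one ncolor vector per event
// devTmp:         ncolor*nevt, one ncolor output vector per event
void colorMatrixTimesJamps( gpuBlasHandle_t handle, const double* devColorMatrix,
                            const double* devJamps, double* devTmp, const int ncolor, const int nevt )
{
  const double alpha = 1., beta = 0.;
  checkGpuBlas( gpuBlasDgemmStridedBatched( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                                            ncolor, 1, ncolor,         // m, n, k: (ncolor x ncolor) * (ncolor x 1)
                                            &alpha,
                                            devColorMatrix, ncolor, 0, // A, lda, strideA (same matrix for every event)
                                            devJamps, ncolor, ncolor,  // B, ldb, strideB (one vector per event)
                                            &beta,
                                            devTmp, ncolor, ncolor,    // C, ldc, strideC
                                            nevt ) );                  // batchCount = number of events
}
#endif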
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index b32f4b931e..9a72b09e5a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
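The DeviceAccessJamp2 helper introduced earlier in this file addresses the jamp2 super-buffer as buffer[icol * nevt + ievt]. The kernel below is only a sketch of that layout and of the atomicAdd accumulation needed when several helicity streams update the same buffer concurrently; it is not part of this patch, and the kernel and argument names are invented:

// Sketch: accumulate |jamp|^2 for each color into a [ncolor][nevt] buffer laid out as icol*nevt+ievt.
// atomicAdd on double requires compute capability 6.0+ (and is available in HIP).
__global__ void accumulateJamp2( const double* jampRe, const double* jampIm, // [ncolor*nevt] for one helicity
                                 double* jamp2,                              // [ncolor*nevt] running sum over helicities
                                 const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double re = jampRe[icol * nevt + ievt];
    const double im = jampIm[icol * nevt + ievt];
    atomicAdd( &jamp2[icol * nevt + ievt], re * re + im * im ); // concurrent helicity streams may hit the same entry
  }
}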
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -368,154 +424,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -555,7 +500,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +537,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +581,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -748,8 +705,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +714,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -920,13 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,18 +1067,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -974,93 +1108,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1173,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1196,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,21 +1205,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1162,8 +1235,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1254,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1361,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index feff1cc6e1..5d952c7419 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index bc9bcfeb9b..008afc92ae 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index db3c284caa..fc3ede89c4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
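C
C [Editor's illustrative sketch - not part of the generated patch]
C The hunk above replaces the flat DO IVEC=1,VECSIZE_USED loop with an
C outer loop over warps, so that the beam ordering IB(1)/IB(2) (taken
C from IMIRROR_VEC) and the SDE configuration ICONFIG (taken from
C ICONF_VEC via SYMCONF, see the added assignment just below) are set
C once per warp before unweighting. Assuming the NB_WARP_USED and
C WARP_SIZE variables already used elsewhere in this routine, the
C event index inside the nested loops is recovered as
C
C       DO CURR_WARP=1, NB_WARP_USED
C         DO IWARP=1, WARP_SIZE
C           IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP
C           ... ! per-event work for event IVEC (CONV, bias, UNWGT)
C         ENDDO
C       ENDDO
C
C so IVEC still covers 1..NB_WARP_USED*WARP_SIZE in the same order as
C the previous single loop, provided VECSIZE_USED equals
C NB_WARP_USED*WARP_SIZE.
C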
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const 
int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + 
fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! 
From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need 
one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/configs.inc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/configs.inc index 99d3eecc56..0dbac30825 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/configs.inc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/configs.inc @@ -24,3 
+24,5 @@ C Diagram 3 DATA (SPROP(I,-2,3),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/3/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fbridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/makefile_original.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f index 707ea40323..8481c73d0f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -227,17 +224,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -307,7 +293,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +336,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +380,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +436,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +450,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index c8b3dbf03c..3519cda091 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + 
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one 
event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... @@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -567,158 +623,43 @@ namespace mg5amcCpu jamp_sv[5] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gg_ttxg()?) - - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
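[Editor's note] The comment block above (removed here and relocated, together with the color matrix itself, into the new color_sum.cc later in this patch) relies on the color matrix being real and symmetric: writing jamp = A + iB, the quadratic form jamp-dagger C jamp equals A^T C A + B^T C B because the imaginary cross terms cancel, and with all color denominators equal (9 for this process) the symmetric off-diagonal terms can be folded into a factor 2 over the upper triangle. The self-contained C++ check below verifies this equivalence for the 6x6 gg->ttxg matrix quoted in this diff; the toy jamp values and the tolerance are illustrative assumptions, not values from the plugin.

// Standalone numerical check (illustration only) that the full complex quadratic
// form and the real upper-triangular form (issue #475) give the same |M|^2.
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 6;
  constexpr double denom[ncolor] = { 9, 9, 9, 9, 9, 9 };
  constexpr double cf[ncolor][ncolor] = {
    { 64, -8, -8, 1, 1, 10 },
    { -8, 64, 1, 10, -8, 1 },
    { -8, 1, 64, -8, 10, 1 },
    { 1, 10, -8, 64, 1, -8 },
    { 1, -8, 10, 1, 64, -8 },
    { 10, 1, 1, -8, -8, 64 } };
  // Arbitrary toy partial amplitudes (in the real code these are the jamp's for one event and one helicity)
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 }, { 0.7, -0.8 }, { 0.9, 1.0 }, { -1.1, 1.2 } };

  // (1) Full quadratic form: sum_ij conj(jamp_i) * cf_ij / denom_i * jamp_j (imaginary part cancels)
  double me1 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < ncolor; j++ ) ztemp += cf[i][j] * jamp[j];
    me1 += ( std::conj( jamp[i] ) * ztemp ).real() / denom[i];
  }

  // (2) Real triangular form: diagonal term once, off-diagonal upper triangle with the factor 2 folded in
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2 += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }

  assert( std::abs( me1 - me2 ) < 1e-12 * std::abs( me1 ) );
  return 0;
}

In the patch itself this logic no longer lives in calculate_jamps: the CPU keeps the triangular real form in color_sum_cpu, while the GPU path computes the color sum in color_sum_kernel (or via BLAS) in the new color_sum.cc.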
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -774,7 +715,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -808,6 +753,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -849,6 +798,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -969,8 +922,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -978,25 +931,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1141,13 +1272,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1159,18 +1284,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1195,93 +1325,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1323,7 +1390,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1346,7 +1413,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1355,21 +1422,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1383,8 +1452,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1400,11 +1471,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1506,14 +1578,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index b583fc85fe..d7ce5daa6c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f index 8843b88a23..ae729ed904 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f index b22dde0f92..aecfa311e2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..9e3ce9d917 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for 
one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x 
+ threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* 
ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/configs.inc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/configs.inc index 1eb9c578f9..a3ad3e22cf 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/configs.inc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/configs.inc @@ 
-171,3 +171,5 @@ C Diagram 15 DATA (SPROP(I,-3,15),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/15/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fbridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/makefile_original.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f index 7d44ae130e..6662900421 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -243,17 +240,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -323,7 +309,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +352,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(9) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -409,43 +396,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /7.111111111111111D+00, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,1.111111111111111D-01,1.111111111111111D-01,1.111111111111111D - $ +00/ + DATA DENOM/9/ + DATA (CF(I),I= 1, 6) /64,-16,-16,2,2,20/ C 1 T(1,2,5,3,4) - DATA (CF(I, 2),I= 1, 6) /-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01,1.111111111111111D - $ +00,-8.888888888888888D-01,1.111111111111111D-01/ + DATA (CF(I),I= 7, 11) /64,2,20,-16,2/ C 1 T(1,5,2,3,4) - DATA (CF(I, 3),I= 1, 6) /-8.888888888888888D-01 - $ ,1.111111111111111D-01,7.111111111111111D+00, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01/ + DATA (CF(I),I= 12, 15) /64,-16,20,2/ C 1 T(2,1,5,3,4) - DATA (CF(I, 4),I= 1, 6) /1.111111111111111D-01 - $ ,1.111111111111111D+00,-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01, - $ -8.888888888888888D-01/ + DATA (CF(I),I= 16, 18) /64,2,-16/ C 1 T(2,5,1,3,4) - DATA (CF(I, 5),I= 1, 6) /1.111111111111111D-01, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01,7.111111111111111D+00,-8.888888888888888D-01/ + DATA (CF(I),I= 19, 20) /64,-16/ C 1 T(5,1,2,3,4) - DATA (CF(I, 6),I= 1, 6) /1.111111111111111D+00 - $ ,1.111111111111111D-01,1.111111111111111D-01, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,7.111111111111111D+00/ + DATA (CF(I),I= 21, 21) /64/ C 1 T(5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
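The CF DATA statements above (and the CF_INDEX loop in the next hunk) store only the upper triangle of the symmetric color matrix as integers, with the off-diagonal entries pre-doubled (e.g. -8 becomes -16) and a single common denominator DENOM=9 divided out once at the end, replacing the old dense REAL*8 matrix. The same triangular trick (#475) is used by TriangularNormalizedColorMatrix in color_sum.cc above. The following minimal standalone C++ sketch only illustrates that arithmetic with dummy jamp values; it is not part of the patch or of the generated code.

#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 6;
  // Packed upper triangle of the color matrix, row by row (ncolor*(ncolor+1)/2 = 21 entries);
  // integer-valued as in the Fortran DATA statements, with off-diagonal entries already doubled
  constexpr double cf[21] = { 64, -16, -16, 2, 2, 20,
                              64, 2, 20, -16, 2,
                              64, -16, 20, 2,
                              64, 2, -16,
                              64, -16,
                              64 };
  constexpr double denom = 9;        // common color denominator (DENOM)
  std::complex<double> jamp[ncolor]; // color amplitudes (dummy values for illustration)
  for( int i = 0; i < ncolor; i++ ) jamp[i] = std::complex<double>( 0.1 * ( i + 1 ), -0.05 * i );
  double meSq = 0;
  int idx = 0; // runs over the packed triangle exactly like CF_INDEX in matrix1.f
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += cf[idx++] * jamp[j]; // only j >= i (symmetric matrix)
    meSq += std::real( ztemp * std::conj( jamp[i] ) );
  }
  meSq /= denom; // divide by DENOM once at the end
  std::printf( "|M|^2 (color sum) = %f\n", meSq );
  return 0;
}

For the GPU BLAS path, color_sum_blas in color_sum.cc above instead performs the color sum in two linear-algebra steps: a GEMM computing Ztemp = NormalizedColorMatrix x Jamps over all events (with the good helicities folded into the event dimension, i.e. nhel*nevt columns), followed by a strided-batched GEMM with m=n=1 that accumulates one per-event dot product Jamp . Ztemp into the MEs buffer (alpha=1, beta=1), once for the real parts and once for the imaginary parts. The plain CPU reference below sketches what those two calls compute for the real part only; the layouts and strides are simplified here (assumed orderings, single helicity, dummy values), so it illustrates the arithmetic rather than the exact cuBLAS/hipBLAS arguments.

#include <cstdio>
#include <vector>

int main()
{
  const int ncolor = 6, nevt = 4; // toy sizes (the production code uses nhel*nevt columns)
  std::vector<double> colorMat( ncolor * ncolor, 0.1 ); // normalized color matrix (dummy values)
  std::vector<double> jampsReal( ncolor * nevt, 1.0 );  // Re(jamp) per color and event (dummy values)
  std::vector<double> ztempReal( ncolor * nevt, 0.0 );  // Step 1 output
  std::vector<double> mes( nevt, 0.0 );                 // running |M|^2 per event
  // Step 1 (the GEMM): Ztemp[:,ievt] = NormalizedColorMatrix * Jamps[:,ievt] for all events at once
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int icol = 0; icol < ncolor; icol++ )
      for( int jcol = 0; jcol < ncolor; jcol++ )
        ztempReal[ievt * ncolor + icol] += colorMat[icol * ncolor + jcol] * jampsReal[jcol * nevt + ievt];
  // Step 2 (the strided-batched GEMM with m=n=1): one dot product per event, accumulated into MEs (beta=1)
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int icol = 0; icol < ncolor; icol++ )
      mes[ievt] += jampsReal[icol * nevt + ievt] * ztempReal[ievt * ncolor + icol];
  std::printf( "ME[0] real-part contribution = %f\n", mes[0] );
  return 0;
}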
@@ -549,10 +525,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -561,6 +539,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x 
* blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
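The cuts.f and genps.f hunks above introduce an optional upper limit dsqrt_shatmax on sqrt(s_hat), with -1 meaning "no upper cut": passcuts rejects events outside the window, gen_s clamps smax to dsqrt_shatmax**2, and GENCMS caps TAUMAX accordingly. A minimal Python sketch of the same window logic follows; the standalone functions and their names are illustrative only and not part of the patch.

    # Illustrative sketch (not part of the patch) of the s_hat window introduced above:
    # dsqrt_shat is the lower cut on sqrt(s_hat), dsqrt_shatmax the optional upper cut,
    # with -1 meaning "no upper cut".
    def passes_shat_window(shat, dsqrt_shat=0.0, dsqrt_shatmax=-1.0):
        """Return True if the invariant mass squared 'shat' survives both cuts."""
        if dsqrt_shat != 0.0 and shat < dsqrt_shat**2:
            return False          # below the minimum sqrt(s_hat)
        if dsqrt_shatmax != -1.0 and shat > dsqrt_shatmax**2:
            return False          # above the maximum sqrt(s_hat)
        return True

    def clamp_smax(smax, dsqrt_shatmax=-1.0):
        """Mirror of the gen_s change: never generate invariants above the upper cut."""
        return min(smax, dsqrt_shatmax**2) if dsqrt_shatmax != -1.0 else smax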
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
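The madevent_*_link targets above validate the requested backend against SUPPORTED_AVXS, and the cudacpp library names to link are derived from BACKEND (cuda and hip get their own suffix, every other supported backend links the *_cpp libraries). The Python sketch below only mirrors that naming logic for illustration; the process identifier "gg_ttx" and the function name are assumptions, not part of the build system.

    # Illustration only: how the makefile derives the cudacpp library names from BACKEND.
    SUPPORTED_BACKENDS = {"cuda", "hip", "cppnone", "cppsse4", "cppavx2",
                          "cpp512y", "cpp512z", "cppauto"}

    def cudacpp_libs(backend, process_id):
        if backend not in SUPPORTED_BACKENDS:
            raise ValueError(f"unsupported BACKEND '{backend}'")
        suffix = backend if backend in ("cuda", "hip") else "cpp"
        return f"mg5amc_common_{suffix}", f"mg5amc_{process_id}_{suffix}"

    # Example: cudacpp_libs("cppavx2", "gg_ttx")
    #          -> ("mg5amc_common_cpp", "mg5amc_gg_ttx_cpp")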
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk new 
file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
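As the new files in this part of the diff show, the SubProcesses makefile is now split into layers: the stock madevent rules in makefile_original.mk, the cudacpp additions in cudacpp_overlay.mk, and a three-line makefile_wrapper.mk that includes both, with 'makefile' itself turned into a symlink to the wrapper. The snippet below is an illustrative check of that layout, assuming exactly the file names shown in this diff; it is not part of the generated process directory.

    # Illustrative check of the makefile layering in a SubProcesses/P* directory
    # (assumes the file names shown in this diff).
    import os, re

    def describe_makefile_layering(pdir):
        mk = os.path.join(pdir, "makefile")
        target = os.readlink(mk) if os.path.islink(mk) else None
        with open(os.path.join(pdir, "makefile_wrapper.mk")) as f:
            includes = re.findall(r"^include\s+(\S+)", f.read(), re.M)
        return {"symlink_target": target, "include_order": includes}

    # Expected result under those assumptions:
    # {'symlink_target': 'makefile_wrapper.mk',
    #  'include_order': ['makefile_original.mk', 'cudacpp_overlay.mk']}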
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
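The new flavour_bias run_card parameter above takes two numbers, the abs(PDG) of the flavour to enhance and the enhancement factor, and its comment states that the increased sampling probability is compensated in the event weight (hence the forced event_norm='bias'). A minimal Python sketch of that compensation follows; it illustrates the intended bookkeeping only and is not the generator's implementation.

    # Illustration of the flavour_bias bookkeeping: events containing the chosen |PDG|
    # are sampled 'factor' times more often, so their weight is divided by 'factor'
    # to preserve cross sections.
    def biased_weight(event_pdgs, weight, flavour_bias=(5, 1)):
        pdg, factor = flavour_bias
        if factor != 1 and any(abs(p) == pdg for p in event_pdgs):
            return weight / factor   # compensate the enhanced sampling probability
        return weight

    # Example: an event with a (anti-)bottom quark and flavour_bias = (5, 100)
    # keeps 1/100 of its original weight.
    print(biased_weight([21, -5, 5], 0.42, (5, 100)))  # -> 0.0042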
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - 
logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in 
opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
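Note on the histograms.py hunk above and the guard added just below: the strict per-bin weight-count check is relaxed, and an empty weight list now raises a clear MadGraph5Error instead of failing later with an IndexError. A stripped-down illustration of that pattern follows; the regex, the function name parse_bin_line and the example header names are invented for this sketch and are not the HwU parser API.

    import re

    # Signed decimal numbers with an optional exponent (illustrative, not HwU.histo_bin_weight_re).
    WEIGHT_RE = re.compile(r'[+-]?\d+(?:\.\d*)?(?:[eE][+-]?\d+)?')

    def parse_bin_line(line, weight_header, central_name='central'):
        values = WEIGHT_RE.findall(line)
        if not values:
            raise ValueError('No weights were found on this bin line.')
        weights = {}
        # zip() quietly stops at the shorter sequence, mirroring the relaxed length check above.
        for name, value in zip(weight_header, values):
            weights['central' if name == central_name else name] = float(value)
        return weights

    # parse_bin_line('0.0  10.0  1.2e-3  4.5e-4',
    #                ['boundary_xmin', 'boundary_xmax', 'central', 'stat_error'])
    # -> {'boundary_xmin': 0.0, 'boundary_xmax': 10.0, 'central': 0.0012, 'stat_error': 0.00045}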
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
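In the launch_plugin.py hunk that follows, the make_opts key changes from FPTYPE to 'override FPTYPE'. With GNU make, an assignment prefixed by the override directive takes precedence over a FPTYPE=... given on the make command line, so the floating_type chosen in the run_card cannot be silently undone at build time. Roughly what writing such an entry looks like; set_make_opt is a hypothetical helper for illustration, not the real CommonRunCmd.update_make_opts_full.

    def set_make_opt(path, key, value):
        """Write 'override KEY = VALUE' into make_opts, replacing any previous KEY line."""
        try:
            with open(path) as f:
                lines = [l for l in f.read().splitlines()
                         if not l.replace('override', '', 1).strip().startswith(key)]
        except FileNotFoundError:
            lines = []
        lines.append('override %s = %s' % (key, value))
        with open(path, 'w') as f:
            f.write('\n'.join(lines) + '\n')

    # set_make_opt('Source/make_opts', 'FPTYPE', 'd')  -> "override FPTYPE = d" wins over "make FPTYPE=f"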
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
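Note on the veto gating in this hunk and the CKKW one further down: only when the external MG5aMC_PY8_interface driver steers the shower with systematics enabled is Pythia8's own merging veto switched off (the driver then reweights events instead of vetoing them); a plain main164 run keeps the Pythia8 defaults. A condensed sketch of that decision, with a hypothetical helper name, not the actual setup_Pythia8RunAndCard logic:

    def veto_overrides(run_type, use_mg5amc_py8_interface, use_syst):
        if not (use_mg5amc_py8_interface and use_syst):
            return {}  # plain main164 runs keep Pythia8's own veto
        if run_type == 'MLM':
            return {'JetMatching:doVeto': False}
        if run_type == 'CKKW':
            return {'Merging:applyVeto': False}
        return {}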
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
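The do_pythia8 hunk above makes the new default path drive Pythia8's bundled main164 example directly (invoked with -c), falling back to the old MG5aMC_PY8_interface only when --old_interface is requested or main164 is missing. A small sketch of that lookup; the search paths are taken from the hunk, the helper name is illustrative.

    import os

    def find_pythia8_main(pythia8_path):
        """Return the main164 example shipped with Pythia8, or None if it was not built."""
        for rel in (('share', 'Pythia8', 'examples', 'main164'),
                    ('examples', 'main164')):
            exe = os.path.join(pythia8_path, *rel)
            if os.path.exists(exe):
                return exe
        return None  # caller falls back to the old MG5aMC_PY8_interface driver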
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
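Each parallel Pythia8 split now gets its own card with Main:numberOfEvents forced to its share of the events (partition_for_PY8, determined during the LHE splitting step) and HEPMCoutput:scaling rescaled accordingly. The divmod-based split generator used elsewhere in madevent_interface.py for chunking G directories illustrates the near-equal partitioning idea; it is reproduced here only for reference.

    def split(a, n):
        """Partition list a into n chunks whose sizes differ by at most one."""
        k, m = divmod(len(a), n)
        return (a[i*k + min(i, m):(i+1)*k + min(i+1, m)] for i in range(n))

    # Example: 10 event files over 3 jobs -> chunk sizes 4, 3, 3
    # [len(c) for c in split(list(range(10)), 3)] == [4, 3, 3]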
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
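The remove_empty_events helper added in the madevent_interface.py hunk above drops G directories whose events.lhe is empty and classifies why by scanning the tail of each channel log for known messages. A stripped-down sketch of that classification pattern follows; it omits the 'not found' re-inclusion, the BackRead reverse reader and the BW/cuts bookkeeping of the real method, and the function name is invented.

    import collections, os

    MESSAGES = {
        'Impossible BW configuration': 'bwconfig',
        'Loosen cuts or increase max_events': 'cuts',
        'all returned zero': 'zero',
    }

    def classify_empty_channels(gdirs):
        """Return (kept_dirs, reasons) where reasons maps a label to G directories."""
        reasons = collections.defaultdict(list)
        kept = []
        for gdir in gdirs:
            events = os.path.join(gdir, 'events.lhe')
            if os.path.exists(events) and os.path.getsize(events) >= 10:
                kept.append(gdir)
                continue
            label = 'unknown'
            try:
                with open(os.path.join(gdir, 'log.txt')) as log:
                    tail = log.readlines()[-200:]
            except OSError:
                tail = []
            for line in reversed(tail):
                for message, name in MESSAGES.items():
                    if message in line:
                        label = name
                        break
                if label != 'unknown':
                    break
            reasons[label].append(gdir)
        return kept, reasons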
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data b/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/madevent b/epochX/cudacpp/gg_tt01g.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/madevent +++ b/epochX/cudacpp/gg_tt01g.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index ff9f0d7f00..a18c3a4ea2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
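The cudacpp_test.mk hunk below adds host detection so that -mavx2 -mfma is only passed to the googletest build on non-Darwin hosts, presumably because arm64 Mac toolchains reject those x86 SIMD flags. The same check, sketched in Python purely for illustration (the flag string is copied from the hunk, the function is not part of the build system):

    import platform

    def gtest_cmake_flags():
        # Mirrors the 'uname -s' check in cudacpp_test.mk: skip x86 SIMD flags on macOS.
        if platform.system() == 'Darwin':
            return []
        return ['-DCMAKE_CXX_FLAGS=-mavx2 -mfma']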
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index c216de0edd..2247620ea0 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006003379821777344  +DEBUG: model prefixing takes 0.004524707794189453  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,21 +150,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.021 s +1 processes with 16 diagrams generated in 0.018 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -177,25 +176,25 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s -Wrote files for 36 helas calls in 0.123 s +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.045 s +Wrote files for 36 helas calls in 0.142 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha 
creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.245 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.310 s +ALOHA: aloha creates 10 routines in 0.234 s VVV1 VVV1 FFV1 @@ -205,38 +204,32 @@ ALOHA: aloha creates 10 routines in 0.310 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 243 (offset 16 lines). 
-DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m2.439s -user 0m2.135s -sys 0m0.297s -Code generation completed in 2 seconds +real 0m2.613s +user 0m2.182s +sys 0m0.417s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -249,7 +242,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -257,10 +250,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -279,7 +271,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -287,10 +279,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index 3ace6e558c..cd6d16fc93 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat index d087670827..a16ea5dee6 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat index 43e93cbf40..cdcd77f36d 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! 
minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/Source/.make_opts b/epochX/cudacpp/gg_ttg.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc b/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_ttg.mad/Source/make_opts b/epochX/cudacpp/gg_ttg.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_ttg.mad/Source/makefile b/epochX/cudacpp/gg_ttg.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttg.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc b/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
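// A minimal standalone sketch (with hypothetical event counts) of the grid choice made in
// the Bridge constructor just above: start from 256 GPU threads per block, halve until the
// Fortran event count is an exact multiple, and give up below s_gputhreadsmin = 32.
#include <stdexcept>
inline void chooseGpuGrid( int nevt, int& gpublocks, int& gputhreads )
{
  constexpr int gputhreadsmin = 32; // same minimum as Bridge::s_gputhreadsmin
  gputhreads = 256;                 // same starting point as the Bridge constructor
  gpublocks = nevt / gputhreads;
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2; // as in the constructor, this should never underflow for a valid nevt
    if( gputhreads < gputhreadsmin ) throw std::logic_error( "cannot choose gputhreads" );
    gpublocks = nevt / gputhreads;
  }
}
// e.g. nevt=16384 gives gpublocks=64, gputhreads=256; nevt=96 gives gpublocks=3, gputhreads=32.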
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
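// Spelled out for a single event, the index arithmetic behind the AOS <-> AOSOA transpositions
// in this file (a sketch; npar, np4 and neppM stand for the same compile-time constants as above):
//   Fortran/AOS  P_MULTI(0:3, NEXTERNAL, VECSIZE_USED)  ->  in [ievt][ipar][ip4]
//   C++/AOSOA    momenta[npagM][npar][np4][neppM]       ->  out[ipagM][ipar][ip4][ieppM]
inline int aosIndex( int ievt, int ipar, int ip4, int npar, int np4 )
{
  return ievt * npar * np4 + ipar * np4 + ip4; // the "fpos" used in the transpose loops
}
inline int aosoaIndex( int ievt, int ipar, int ip4, int npar, int np4, int neppM )
{
  const int ipagM = ievt / neppM; // SIMD page holding this event
  const int ieppM = ievt % neppM; // slot of this event inside its page
  return ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // the "cpos"
}
// F2C copies out[aosoaIndex(...)] = in[aosIndex(...)]; C2F copies in the opposite direction,
// and when neppM == 1 the two layouts coincide, so a plain memcpy is enough.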
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
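// A minimal usage sketch (hypothetical kernel, sizes and function name) of how the stream and
// BLAS helpers introduced above in GpuAbstraction.h and GpuRuntime.h compose in a CUDA or HIP
// build: create a stream, bind a cuBLAS/hipBLAS handle to it, check every BLAS return code with
// checkGpuBlas, and launch work on that stream via gpuLaunchKernelStream. This is not plugin
// code, only an illustration of the intended wiring.
#include "GpuAbstraction.h"
#include "GpuRuntime.h"
__global__ void dummyKernel( int* out ) { out[0] = 1; } // hypothetical kernel
void streamAndBlasSketch( int* devOut )
{
  gpuStream_t stream;
  gpuStreamCreate( &stream );                         // cudaStreamCreate / hipStreamCreate, wrapped in checkGpu
#ifndef MGONGPU_HAS_NO_BLAS
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // cublasCreate / hipblasCreate
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // attach the handle to this stream
#endif
  gpuLaunchKernelStream( dummyKernel, 1, 32, stream, devOut ); // 1 block of 32 threads on this stream
  checkGpu( gpuDeviceSynchronize() );
#ifndef MGONGPU_HAS_NO_BLAS
  checkGpuBlas( gpuBlasDestroy( handle ) );
#endif
  gpuStreamDestroy( stream );
}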
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ?
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
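The hunks below add per-good-helicity "super-buffers" (m_pHelMEs, m_pHelJamps, m_pHelNumerators, m_pHelDenominators) that pack one [nevt]-sized slice per good helicity. A minimal standalone sketch of the ighel-major indexing convention assumed for the ME/numerator/denominator super-buffers (helSlot and sumOverGoodHel are hypothetical helpers, not plugin code; fptype is taken as double here):

// Element for good helicity ighel and event ievt in a [nGoodHel][nevt] super-buffer
__device__ inline double& helSlot( double* superBuf, int ighel, int ievt, int nevt )
{
  return superBuf[ighel * nevt + ievt]; // one contiguous [nevt] slice per good helicity
}

// Example: sum the per-helicity MEs of one event, as the helicity-selection step does with its running sums
__device__ inline double sumOverGoodHel( const double* superBuf, int nGoodHel, int ievt, int nevt )
{
  double me = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) me += superBuf[ighel * nevt + ievt];
  return me;
}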
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 5de1c626c8..037b031386 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
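Before the next hunk, it may help to spell out the device-memory layout that the DeviceAccessJamp2 accessor above encodes: per-color quantities are stored color-major as [ncolor][nevt], so the value for color icol and event ievt sits at buffer[icol * nevt + ievt] and neighbouring events are adjacent in memory. A minimal standalone sketch (fillColorMajor is a hypothetical kernel, not plugin code; it assumes a 1D grid covering exactly nevt events, one event per thread, as in the accessor above):

__global__ void fillColorMajor( double* buf, int ncolor, double value )
{
  const int nevt = gridDim.x * blockDim.x;                // total number of threads == number of events
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
  for( int icol = 0; icol < ncolor; icol++ )
    buf[icol * nevt + ievt] = value; // coalesced store: consecutive threads write consecutive addresses
}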
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -567,158 +623,43 @@ namespace mg5amcCpu jamp_sv[5] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxg()?) 
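For reference, the color algebra that the removed block below computed inline per event (and that the new color_sum_gpu/color_sum_cpu helpers now perform outside calculate_jamps) is the real quadratic form |M|^2 += sum_i sum_j cf[i][j] * Re( conj(jamp[i]) * jamp[j] ) / denom[i]. A minimal host-side reference sketch, assuming a row-major cf (colorSumNaive is a hypothetical helper, not plugin code):

#include <complex>
double colorSumNaive( const std::complex<double>* jamp, const double* cf, const double* denom, int ncolor )
{
  double me = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i * ncolor + j] * jamp[j].real(); // cf is real, so real and imaginary parts decouple
      ztempI += cf[i * ncolor + j] * jamp[j].imag();
    }
    me += ( ztempR * jamp[i].real() + ztempI * jamp[i].imag() ) / denom[i];
  }
  return me;
}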
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
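Because the color matrix is real, the quadratic form discussed just above splits into two independent real forms, reJ^T * CFhat * reJ + imJ^T * CFhat * imJ with CFhat[i][j] = cf[i][j] / denom[i], which is the kind of dense linear algebra (GEMV/DOT per event, or batched GEMM across events) that the new cuBLAS/hipBLAS color-sum path can delegate to a BLAS library. A hedged single-event sketch using plain cuBLAS calls (the function name and the column-major d_cfhat/d_reJ/d_imJ/d_tmp device arrays are assumptions for illustration, not the plugin's color_sum implementation; error checking omitted for brevity):

#include <cublas_v2.h>
double colorSumBlasOneEvent( cublasHandle_t h, const double* d_cfhat, const double* d_reJ, const double* d_imJ, double* d_tmp, int ncolor )
{
  const double one = 1.0, zero = 0.0;
  double meR = 0.0, meI = 0.0;
  // d_tmp = CFhat * reJ, then meR = reJ . d_tmp
  cublasDgemv( h, CUBLAS_OP_N, ncolor, ncolor, &one, d_cfhat, ncolor, d_reJ, 1, &zero, d_tmp, 1 );
  cublasDdot( h, ncolor, d_reJ, 1, d_tmp, 1, &meR ); // with the default host pointer mode this call blocks until meR is ready
  // d_tmp = CFhat * imJ, then meI = imJ . d_tmp
  cublasDgemv( h, CUBLAS_OP_N, ncolor, ncolor, &one, d_cfhat, ncolor, d_imJ, 1, &zero, d_tmp, 1 );
  cublasDdot( h, ncolor, d_imJ, 1, d_tmp, 1, &meI );
  return meR + meI; // |M|^2 contribution of this helicity for this event
}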
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -774,7 +715,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -808,6 +753,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -849,6 +798,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -969,8 +922,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -978,25 +931,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1141,13 +1272,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1159,18 +1284,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1195,93 +1325,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1323,7 +1390,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1346,7 +1413,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1355,21 +1422,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1383,8 +1452,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1400,11 +1471,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1506,14 +1578,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 2acfa000a7..69d8ea8b08 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index 10496aa04d..19937ed005 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index 7c8695090c..9e5f9c9b0a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..9e3ce9d917 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one 
specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x 
+ threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* 
ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/configs.inc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/configs.inc index 1eb9c578f9..a3ad3e22cf 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/configs.inc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/configs.inc @@ -171,3 +171,5 
@@ C Diagram 15 DATA (SPROP(I,-3,15),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/15/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fbridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/makefile_original.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f index 797b19405d..48e24320cc 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -243,17 +240,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -323,7 +309,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +352,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(9) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -409,43 +396,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /7.111111111111111D+00, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,1.111111111111111D-01,1.111111111111111D-01,1.111111111111111D - $ +00/ + DATA DENOM/9/ + DATA (CF(I),I= 1, 6) /64,-16,-16,2,2,20/ C 1 T(1,2,5,3,4) - DATA (CF(I, 2),I= 1, 6) /-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01,1.111111111111111D - $ +00,-8.888888888888888D-01,1.111111111111111D-01/ + DATA (CF(I),I= 7, 11) /64,2,20,-16,2/ C 1 T(1,5,2,3,4) - DATA (CF(I, 3),I= 1, 6) /-8.888888888888888D-01 - $ ,1.111111111111111D-01,7.111111111111111D+00, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01/ + DATA (CF(I),I= 12, 15) /64,-16,20,2/ C 1 T(2,1,5,3,4) - DATA (CF(I, 4),I= 1, 6) /1.111111111111111D-01 - $ ,1.111111111111111D+00,-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01, - $ -8.888888888888888D-01/ + DATA (CF(I),I= 16, 18) /64,2,-16/ C 1 T(2,5,1,3,4) - DATA (CF(I, 5),I= 1, 6) /1.111111111111111D-01, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01,7.111111111111111D+00,-8.888888888888888D-01/ + DATA (CF(I),I= 19, 20) /64,-16/ C 1 T(5,1,2,3,4) - DATA (CF(I, 6),I= 1, 6) /1.111111111111111D+00 - $ ,1.111111111111111D-01,1.111111111111111D-01, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,7.111111111111111D+00/ + DATA (CF(I),I= 21, 21) /64/ C 1 T(5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
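C
C NB (illustrative note, not part of the generated code): the new DATA
C statements above store only the upper triangle of the symmetric colour
C matrix as integers, with the off-diagonal entries pre-doubled
C (e.g. -16=2*(-8), 2=2*1, 20=2*10) and a single denominator DENOM=9,
C replacing the old full REAL*8 CF(NCOLOR,NCOLOR) with entries such as
C 64/9=7.111... The running counter CF_INDEX in the colour sum below
C walks this packed storage row by row; equivalently, for J.GE.I,
C   CF_INDEX = (I-1)*NCOLOR - (I-1)*(I-2)/2 + (J-I+1)
C so that row I=1 uses CF(1:6), I=2 uses CF(7:11), ..., I=6 uses CF(21),
C matching the DATA ranges above. Dividing MATRIX1 by DENOM once at the
C end, together with the doubled off-diagonal terms and the symmetry of
C CF, reproduces the old full-matrix normalisation (the same triangular
C colour matrix idea as issue #475 in color_sum.cc).
C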
@@ -549,10 +525,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -561,6 +539,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; 
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
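# NB (illustrative usage note, not part of cudacpp.mk itself): with the
# HASBLAS logic above, a plain build auto-detects cuBLAS/hipBLAS from the
# CUDA_HOME/HIP_HOME headers, and the choice can be forced by hand, e.g.
#   make HASBLAS=hasBlas    # link -lcublas (CUDA) or -lhipblas (HIP) via BLASLIBFLAGS
#   make HASBLAS=hasNoBlas  # build without BLAS (-DMGONGPU_HAS_NO_BLAS)
# The surrounding hunks add $(BLASLIBFLAGS) to the link lines of the shared
# library and of the check/test executables; at runtime the BLAS colour sum
# is only used when a BLAS handle is passed to color_sum_gpu (cf. the
# CUDACPP_RUNTIME_BLASCOLORSUM comment in color_sum.cc), otherwise the
# color_sum_kernel path is taken.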
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024)
+
+# To be used after the project makefile
+SHELL := /bin/bash
+
+# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829)
+# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing
+include ../../src/cudacpp_config.mk
+ifeq ($(CUDACPP_BUILDDIR),)
+ $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!)
+endif
+
+# Basic uname helpers (if not already set)
+UNAME_S ?= $(shell uname -s)
+UNAME_P ?= $(shell uname -p)
+
+# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html
+FFLAGS+= -cpp
+
+# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740)
+CXXFLAGS = -O3 -Wall -Wshadow -Wextra
+
+# Add -std=c++17 explicitly to avoid build errors on macOS
+# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked"
+ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+ CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3
+endif
+
+# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran)
+ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
+ override CXX := ccache $(CXX)
+endif
+
+# ----------------------------------------------------------------------
+# Backend library names and process id
+# ----------------------------------------------------------------------
+CUDACPP_MAKEFILE := cudacpp.mk
+processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
+
+ifeq ($(BACKEND),cuda)
+ CUDACPP_COMMONLIB := mg5amc_common_cuda
+ CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda
+else ifeq ($(BACKEND),hip)
+ CUDACPP_COMMONLIB := mg5amc_common_hip
+ CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip
+else
+ CUDACPP_COMMONLIB := mg5amc_common_cpp
+ CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp
+endif
+
+# ----------------------------------------------------------------------
+# Libraries and link line adjustments
+# ----------------------------------------------------------------------
+# Prefer LIBDIR everywhere; base makefile already defines LIBDIR.
+LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \
+ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias
+
+# OpenMP: enable only if requested, USEOPENMP=1 (#758)
+ifeq ($(USEOPENMP),1)
+ ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
+ override OMPFLAGS = -fopenmp
+ LINKLIBS += -liomp5 # see #578
+ LINKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy'
+ else ifneq ($(shell $(CXX) --version | egrep '^clang'),)
+ override OMPFLAGS = -fopenmp
+ # For the *cpp* binary with clang, ensure libomp is found
+ $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604
+ else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+ override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang
+ else
+ override OMPFLAGS = -fopenmp
+ endif
+endif
+
+# ----------------------------------------------------------------------
+# Objects & targets
+# ----------------------------------------------------------------------
+# Keep driver* separate from PROCESS; we form DSIG groups below.
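Editor's note, not part of the diff: the overlay above maps the BACKEND choice onto a library suffix, with every cpp* backend sharing the single _cpp library. A minimal C++ sketch of that naming convention follows; the function names are illustrative only.

// Editorial sketch only: reproduce the BACKEND -> library-suffix convention used by
// CUDACPP_COMMONLIB / CUDACPP_BACKENDLIB above ("cuda", "hip", everything else -> "cpp").
#include <string>

std::string backendLibSuffix( const std::string& backend )
{
  if( backend == "cuda" ) return "cuda";
  if( backend == "hip" ) return "hip";
  return "cpp"; // cppnone, cppsse4, cppavx2, cpp512y, cpp512z, cppauto all map to _cpp
}

std::string backendLib( const std::string& processidShort, const std::string& backend )
{
  return "mg5amc_" + processidShort + "_" + backendLibSuffix( backend );
}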
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
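Editor's note, not part of the diff: the new SubProcesses/fbridge.h added earlier in this diff declares the Fortran-callable bridge entry points (fbridgecreate_, fbridgesequence_*, fbridgedelete_, fbridgegetngoodhel_). The sketch below illustrates calling that interface directly from C++; it assumes the cudacpp plugin headers and the generated process library are available, and the event and particle counts are arbitrary placeholders.

// Editorial sketch only: create a bridge, run one matrix-element evaluation without
// multichannel weights, then delete the bridge. Signatures follow fbridge.h above.
#include "fbridge.h"
#include <vector>

using namespace mg5amcCpu; // assumption: mg5amcGpu in GPU builds, as in fbridge.h

int main()
{
  const int nevt = 16, npar = 5, np4 = 4; // placeholder sizes (5 external particles, 4-momenta)
  CppObjectInFortran* bridge = nullptr;
  fbridgecreate_( &bridge, &nevt, &npar, &np4 );
  std::vector<double> momenta( nevt * npar * np4, 0. ), gs( nevt, 1.2 );
  std::vector<double> rndhel( nevt, 0.5 ), rndcol( nevt, 0.5 ), mes( nevt, 0. );
  std::vector<int> selhel( nevt, 0 ), selcol( nevt, 0 );
  const bool goodHelOnly = false;
  fbridgesequence_nomultichannel_( &bridge, momenta.data(), gs.data(), rndhel.data(), rndcol.data(),
                                   mes.data(), selhel.data(), selcol.data(), &goodHelOnly );
  fbridgedelete_( &bridge );
  return 0;
}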
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
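Editor's note, not part of the diff: the cuts.f and genps.f changes earlier in this diff introduce an optional upper bound on sqrt(s-hat), where dsqrt_shatmax = -1 means no upper cut; otherwise events above dsqrt_shatmax**2 are rejected and the phase-space generator clamps smax and TAUMAX accordingly. A compact C++ sketch of the resulting window test follows.

// Editorial sketch only, mirroring the Fortran logic added to cuts.f/genps.f above:
// accept an event with invariant mass squared shat if it lies inside the optional window
// [dsqrt_shat**2, dsqrt_shatmax**2], where dsqrt_shatmax = -1 disables the upper bound.
bool passShatWindow( double shat, double dsqrtShat, double dsqrtShatMax )
{
  if( shat < dsqrtShat * dsqrtShat ) return false;                              // lower cut (dsqrt_shat)
  if( dsqrtShatMax != -1. && shat > dsqrtShatMax * dsqrtShatMax ) return false; // upper cut (dsqrt_shatmax)
  return true;
}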
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk new file mode 
100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
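Editor's note, not part of the diff: the run_card gains a hidden flavour_bias parameter above, documented as e.g. '5,100', meaning events with the chosen flavour are generated 100 times more often while their weight is divided by 100, which is why event_norm is forced to 'bias'. The sketch below illustrates that compensating reweighting; the function name is hypothetical.

// Editorial sketch only: compensate an enhanced sampling probability by dividing the
// event weight, so the biased sample still integrates to the unbiased cross section.
double compensateFlavourBias( double weight, bool eventHasBiasedFlavour, double enhancement )
{
  return eventHasBiasedFlavour ? weight / enhancement : weight;
}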
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + 
logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "
To save bandwidth not all diagrams were converted to PNG."; print PAGE "
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
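Aside on the histograms.py hunk above: the gnuplot version probe now decodes the subprocess output and compares the integer major version instead of calling float() on the whole token, which is fragile for version strings that are not plain floats. A small self-contained sketch of that parsing, assuming the probe returns a "gnuplot X.Y ..." banner (the example banners below are made up):

def gnuplot_major_version(version_output: bytes) -> int:
    # decode the raw subprocess output, then keep only the integer major version
    text = version_output.decode(errors='ignore')   # e.g. "gnuplot 5.4 patchlevel 2"
    return int(text.split()[1].split('.')[0])

assert gnuplot_major_version(b"gnuplot 5.4 patchlevel 2") == 5
assert gnuplot_major_version(b"gnuplot 4.6 patchlevel 6") == 4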
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
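Note on the gridpack refinement changes above (gen_ximprove_gridpack and the new nprocs/maxevts arguments of GridPackCmd): channels are now split into several jobs when they need more than max_request_event events. The splitting arithmetic, as an illustrative standalone function with made-up numbers (not the actual get_job_for_event code):

def number_of_splits(needed_event, max_request_event, max_splitting, split_channels=True):
    # ceil(needed_event / max_request_event), clamped to the range [1, max_splitting]
    nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
    if not split_channels:
        nb_split = 1
    return max(1, min(nb_split, max_splitting))

assert number_of_splits(10000, 2500, 100) == 4   # four jobs of ~2500 requested events each
assert number_of_splits(1200, 2500, 100) == 1    # below the per-job maximum: no split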
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data b/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_ttg.mad/bin/madevent b/epochX/cudacpp/gg_ttg.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/madevent +++ b/epochX/cudacpp/gg_ttg.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index ff9f0d7f00..a18c3a4ea2 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
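With the NSIGHT hooks commented out above, the mgDebug* macros are now unconditional no-ops, so instrumented call sites keep compiling but generate no code. A small sketch, assuming only the no-op macros from mgOnGpuConfig.h (exampleSigmaKinBody is an illustrative name, not the generated kernel):

// Sketch: mgDebug* calls now expand to /*noop*/ in every build (CUDA, HIP and C++).
#include "mgOnGpuConfig.h"

void exampleSigmaKinBody()
{
  mgDebugDeclare();
  mgDebugInitialise();
  mgDebug( 0, __FUNCTION__ ); // start marker (arguments are currently unused)
  // ... matrix element computation would go here ...
  mgDebug( 1, __FUNCTION__ ); // end marker
  mgDebugFinalise();
}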
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 882c93c3a5..9c2ae753b6 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006311178207397461  +DEBUG: model prefixing takes 0.0047342777252197266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,33 +150,33 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.018 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. +Generated helas calls for 1 subprocesses (16 diagrams) in 0.034 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.323 s +ALOHA: aloha creates 5 routines in 0.247 s VVV1 VVV1 FFV1 @@ -187,17 +186,17 @@ ALOHA: aloha creates 5 routines in 0.323 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.791s -user 0m0.727s -sys 0m0.049s +real 0m0.729s +user 0m0.664s +sys 0m0.058s Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. 
- * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
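The reflowed Bridge documentation above pins down the two momenta layouts: Fortran AOS P_MULTI(0:3, NEXTERNAL, VECSIZE_USED), read from C as p_multi[nevtF][nparF][np4F], and the CUDA/C++ AOSOA momenta[npagM][npar][np4][neppM] with nevt = npagM*neppM. A standalone worked sketch of the F2C index arithmetic used by the transposition code later in this file (the npar, np4 and neppM values are illustrative only):

// Sketch: AOS (Fortran) -> AOSOA (cudacpp) index mapping for the Bridge momenta copy.
#include <cassert>

int main()
{
  constexpr int npar = 5;  // external particles (e.g. g g > t t~ g)
  constexpr int np4 = 4;   // E, px, py, pz
  constexpr int neppM = 8; // events per AOSOA "page"
  const int nevt = 32;     // must be a multiple of neppM
  assert( nevt % neppM == 0 );
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int ipagM = ievt / neppM;
        const int ieppM = ievt % neppM;
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                  // AOS position
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // AOSOA position
        // out[cpos] = in[fpos]; // this is the F2C copy performed by the Bridge transposition
        (void)fpos;
        (void)cpos;
      }
  return 0;
}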
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
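The reformatted declarations keep the Bridge call pattern unchanged: construct with (nevtF, nparF, np4F), then hand gpu_sequence or cpu_sequence the Fortran-ordered host buffers documented above (both take the same argument list). A hedged host-side sketch for a C++ build (exampleBridgeCall and the buffer contents are illustrative; fbridge.cc is what actually drives this from MadEvent):

// Sketch: driving the Bridge from C++ with Fortran-ordered host buffers.
#include "Bridge.h"
#include <vector>

void exampleBridgeCall()
{
  using namespace mg5amcCpu;                // mg5amcGpu in GPU builds
  const unsigned int nevt = 32, npar = 5, np4 = 4;
  Bridge<double> bridge( nevt, npar, np4 ); // checks npar/np4 against CPPProcess, reads param_card.dat
  std::vector<double> momenta( nevt * npar * np4 ); // P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) layout
  std::vector<double> gs( nevt ), rndhel( nevt ), rndcol( nevt ), mes( nevt );
  std::vector<int> selhel( nevt ), selcol( nevt );
  std::vector<unsigned int> channelIds( nevt, 1 ); // Feynman diagram to enhance (or pass nullptr)
  bridge.cpu_sequence( momenta.data(), gs.data(), rndhel.data(), rndcol.data(),
                       channelIds.data(), mes.data(), selhel.data(), selcol.data() );
}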
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
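The constructor body above chooses the GPU grid from the number of events: it starts from 256 threads per block, requires nevt to be at least and a multiple of s_gputhreadsmin = 32, and halves gputhreads until gpublocks*gputhreads equals nevt. A standalone sketch of that selection with two worked values (chooseGpuGrid is an illustrative re-implementation, not the Bridge code itself):

// Sketch: the block/thread selection loop used in the Bridge constructor, with worked examples.
#include <iostream>
#include <stdexcept>
#include <string>

void chooseGpuGrid( int nevt, int& gpublocks, int& gputhreads )
{
  constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT)
  gputhreads = 256;                   // default number of gpu threads
  gpublocks = nevt / gputhreads;      // initial guess, refined below
  if( ( nevt < s_gputhreadsmin ) || ( nevt % s_gputhreadsmin != 0 ) )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2;
    if( gputhreads < s_gputhreadsmin ) throw std::logic_error( "cannot choose gputhreads" );
    gpublocks = nevt / gputhreads;
  }
}

int main()
{
  int blocks, threads;
  chooseGpuGrid( 8192, blocks, threads );
  std::cout << "8192 events -> " << blocks << " blocks x " << threads << " threads" << std::endl; // 32 x 256
  chooseGpuGrid( 96, blocks, threads );
  std::cout << "  96 events -> " << blocks << " blocks x " << threads << " threads" << std::endl; //  3 x 32
  return 0;
}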
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
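The GpuAbstraction.h additions above expose one gpuBlas*/gpuStream* vocabulary that resolves to cuBLAS under CUDA and hipBLAS under HIP, with the gpuBlasT* aliases picking the single- or double-precision entry points according to MGONGPU_FPTYPE2_FLOAT, while GpuRuntime.h adds the matching checkGpuBlas error check. A hedged sketch of one GEMM written against this layer (exampleColorGemm, the matrix shapes and the per-call handle are illustrative only, not the generated color-sum code):

// Sketch: a single GEMM through the gpuBlas* abstraction; resolves to cublasDgemm/Sgemm
// under CUDA and hipblasDgemm/Sgemm under HIP, depending on MGONGPU_FPTYPE2_FLOAT.
#include "mgOnGpuConfig.h"
#include "GpuAbstraction.h"
#include "GpuRuntime.h"

#if defined MGONGPUCPP_GPUIMPL && !defined MGONGPU_HAS_NO_BLAS
using mgOnGpu::fptype2;
void exampleColorGemm( const fptype2* devA, const fptype2* devB, fptype2* devC, int ncolor, int nevt )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  const fptype2 alpha = 1, beta = 0;
  // Column-major C[ncolor x nevt] = A[ncolor x ncolor] * B[ncolor x nevt], all in device memory
  checkGpuBlas( gpuBlasTgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                              ncolor, nevt, ncolor,
                              &alpha, devA, ncolor, devB, ncolor,
                              &beta, devC, ncolor ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif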
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
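The device buffers allocated in computeGoodHelicities above are "super-buffers": the ME, numerator and denominator buffers hold nGoodHel contiguous slices of nevt values each (the jamp super-buffer adds colour and real/imaginary dimensions on top), and the reworked sigmaKin later addresses a slice for good-helicity index ighel as base + ighel*nevt. A minimal sketch of that addressing convention, with a hypothetical kernel name (sumOverGoodHelicities is not part of this patch) and the usual one-thread-per-event mapping:

  __global__ void
  sumOverGoodHelicities( const fptype* ghelAllMEs, // input: [nGoodHel][nevt] ME super-buffer (slice ighel starts at ighel*nevt)
                         fptype* allMEs,           // output: allMEs[nevt], sum over good helicities
                         const int nGoodHel,
                         const int nevt )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
    if( ievt >= nevt ) return;
    fptype sum = 0;
    for( int ighel = 0; ighel < nGoodHel; ighel++ )
      sum += ghelAllMEs[ighel * nevt + ievt]; // slice ighel, event ievt
    allMEs[ievt] = sum;
  }

In the actual patch this reduction is fused with the helicity selection in add_and_select_hel, which reuses the super-buffer to store the running sums needed for that selection.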
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace 
mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index bf77ac9970..896544668f 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
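The DeviceAccessJamp2 helper introduced above encodes a simple SoA convention, buffer[icol*nevt + ievt]: for a fixed colour index the event index is the fastest-varying one, so consecutive threads of a warp touch consecutive addresses (coalesced access). A minimal sketch of a kernel using the same indexing (the kernel name and the per-helicity contrib buffer are illustrative only, not part of this patch); as the accumulation just below notes, atomics are needed because kernels in different helicity streams may update the same jamp2 buffer concurrently:

  __global__ void
  accumulateJamp2( fptype* colAllJamp2s,   // in/out: running sums jamp2[ncolor][nevt] in SoA layout
                   const fptype* contrib ) // input: contribution of one helicity, same [ncolor][nevt] layout
  {
    const int nevt = gridDim.x * blockDim.x;
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    for( int icol = 0; icol < CPPProcess::ncolor; icol++ )
    {
      // Same indexing as DeviceAccessJamp2::kernelAccessIcol: fixed icol, consecutive ievt => coalesced
      // atomicAdd protects against concurrent updates from kernels running in other helicity streams
      atomicAdd( &colAllJamp2s[icol * nevt + ievt], contrib[icol * nevt + ievt] );
    }
  }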
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -561,158 +617,43 @@ namespace mg5amcCpu jamp_sv[5] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxg()?) 
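For reference, the colour-matrix block removed below computed, per event and per helicity,

  |M|^2 += sum_{i,j} cf[i][j] * ( Re(jamp_i)*Re(jamp_j) + Im(jamp_i)*Im(jamp_j) ) / denom[i]
         = sum_{i,j} Re( conj(jamp_i) * jamp_j ) * cf[i][j] / denom[i]

i.e. a real quadratic form in the ncolor partial amplitudes, with cf a real symmetric matrix and denom a per-row normalization (here denom[i] = 9 for all i). This is the reduction that is now performed outside calculate_jamps by the new color_sum_gpu / color_sum_cpu functions.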
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
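The per-event quadratic form above lends itself to a BLAS formulation across all events of a given helicity, which is what the new cuBLAS/hipBLAS path enables. The following is only a sketch of the idea under stated assumptions (double precision, host pointer mode; handle, jampR, colorMatrixNorm and tmp are placeholder names), not necessarily what color_sum.cc implements: with the real parts of the jamps stored as jampR[icol*nevt + ievt], the buffer can be read as an nevt-by-ncolor column-major matrix J (lda = nevt); with Cn[i][j] = cf[i][j]/denom[i], a single GEMM computes T = J*Cn, after which ME[ievt] += sum_i J(ievt,i)*T(ievt,i), and the same is repeated for the imaginary parts.

  const double one = 1, zero = 0;
  checkGpuBlas( gpuBlasDgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                              nevt, ncolor, ncolor,      // m, n, k
                              &one,
                              jampR, nevt,               // J  (nevt x ncolor, column-major, lda = nevt)
                              colorMatrixNorm, ncolor,   // Cn (ncolor x ncolor, ldb = ncolor)
                              &zero,
                              tmp, nevt ) );             // T  (nevt x ncolor, ldc = nevt)
  // ...followed by a small per-event kernel (or dot products) accumulating sum_i J(ievt,i)*T(ievt,i) into ME[ievt]

Batching over events turns many ncolor-sized per-event loops into a few large GEMMs; this is also why the CUDACPP_RUNTIME_CUBLASTF32TENSOR option earlier in this patch is only honoured when the colour algebra runs in single precision (MGONGPU_FPTYPE2_FLOAT).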
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -768,7 +709,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -802,6 +747,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -843,6 +792,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -963,8 +916,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -972,25 +925,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1135,13 +1266,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1153,18 +1278,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1189,93 +1319,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1317,7 +1384,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1340,7 +1407,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1349,21 +1416,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1377,8 +1446,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1394,11 +1465,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1500,14 +1572,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 2acfa000a7..69d8ea8b08 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..9e3ce9d917 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
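The new color_sum.h/color_sum.cc pair introduced below factors the leading-colour sum out of the old calculate_wavefunctions: sigmaKin now obtains the colour amplitudes (jamps) from calculate_jamps and delegates the sum to color_sum_cpu, color_sum_kernel or color_sum_blas. As a sketch in the notation of the code below (jamps J_i, colour-matrix numerators C_ij, row denominators d_i), the quantity each routine adds to the running |M|^2 for one helicity is

\[
\Delta|M|^2 \;=\; \sum_{i=1}^{n_{\mathrm{color}}} \frac{1}{d_i}\,\mathrm{Re}\!\left( J_i^{*} \sum_{j=1}^{n_{\mathrm{color}}} C_{ij}\, J_j \right),
\]

and, writing J = A + iB with M_{ij} = C_{ij}/d_i real and symmetric (so the imaginary cross terms cancel),

\[
\Delta|M|^2 \;=\; A^{T} M A + B^{T} M B \;=\; \sum_i M_{ii}\,(A_i^2 + B_i^2) \;+\; 2 \sum_{i<j} M_{ij}\,(A_i A_j + B_i B_j),
\]

which is the triangular form precomputed at compile time in TriangularNormalizedColorMatrix (CPU path) and used in the "ihel3p1" loop of color_sum_kernel (GPU path). For the GPU BLAS path, the jamp super-buffer is laid out as two real blocks of shape [ncolor][nGoodHel][nevt] (all real parts first, then all imaginary parts), so that for fixed (icol, ihel) the events are contiguous; color_sum_blas can then perform the whole sum as a GEMM per real/imaginary component (Ztemp = NormColMat x Jamps^T) followed by strided-batched 1x1 GEMMs, i.e. per-event dot products accumulated into the per-helicity MEs with beta = 1. A minimal index helper illustrating only the striding used by DeviceAccessJamp (the helper itself is hypothetical, not plugin code):

#include <cstddef>
// Flat offsets into the [2][ncolor][nGoodHel][nevt] jamp super-buffer (illustrative only)
inline std::size_t jampRealIndex( int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
{
  return ( (std::size_t)( 0 * ncolor + icol ) * nhel + ihel ) * nevt + ievt; // real block
}
inline std::size_t jampImagIndex( int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
{
  return ( (std::size_t)( 1 * ncolor + icol ) * nhel + ihel ) * nevt + ievt; // imaginary block
}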
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
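    // (Spelled out: with jamp = A + i*B and a real matrix M, the quadratic form expands as
    //  (A - i*B)^T M (A + i*B) = A^T M A + B^T M B + i*( A^T M B - B^T M A );
    //  the imaginary cross terms cancel because M is symmetric, leaving A^T M A + B^T M B.)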
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fbridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/makefile_original.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // 
"old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
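# For example (illustrative: assuming the P* directory contains auto_dsig.f and auto_dsig1.f),
#   DSIG         would expand to: driver.o auto_dsig1.o
#   DSIG_cudacpp would expand to: driver_cudacpp.o auto_dsig1_cudacpp.o
# i.e. the wildcard picks up every auto_dsig*.f except the top-level auto_dsig.f, which is listed separately in the link rules.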
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
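The extern "C" declarations below are the Fortran-facing bridge API; the following is only a rough sketch of the calling sequence they imply, with all buffer names, sizes and values being illustrative assumptions rather than plugin code:

// Hedged sketch: driving the bridge declared below from C++ (in production it is called from Fortran via madevent)
#include <vector>
#include "fbridge.h" // brings in CppObjectInFortran and the extern "C" declarations

void exampleBridgeSequence( const int nevt, const int npar )
{
  const int np4 = 4;                                  // #momentum components per particle
  CppObjectInFortran* bridge = nullptr;
  fbridgecreate_( &bridge, &nevt, &npar, &np4 );      // create the bridge for nevt events
  std::vector<double> momenta( nevt * npar * np4 );   // input: external momenta
  std::vector<double> gs( nevt );                     // input: strong coupling per event
  std::vector<double> rndhel( nevt ), rndcol( nevt ); // input: random numbers for helicity/colour selection
  std::vector<unsigned int> channelIds( nevt, 1 );    // input: single-diagram enhancement channel ids
  std::vector<double> mes( nevt );                    // output: matrix elements
  std::vector<int> selhel( nevt ), selcol( nevt );    // output: selected helicities and colours
  const bool goodHelOnly = false;                     // input: good-helicity-only mode flag
  fbridgesequence_( &bridge, momenta.data(), gs.data(), rndhel.data(), rndcol.data(),
                    channelIds.data(), mes.data(), selhel.data(), selcol.data(), &goodHelOnly );
  fbridgedelete_( &bridge );                          // release the bridge
}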
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index ff9f0d7f00..a18c3a4ea2 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
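[Editor's aside, a minimal sketch, not part of the patch] The MGONGPU_HAS_NO_BLAS switch introduced in mgOnGpuConfig.h above is consumed by guarding each cuBLAS/hipBLAS call site and falling back to a BLAS-free path otherwise; the gpuBlas* aliases and the checkGpuBlas macro appear in the GpuAbstraction.h and GpuRuntime.h hunks later in this diff. The helper below is hypothetical and only illustrates the guard pattern (y := a*x + y on device buffers via the double-precision axpy).

// Editor's sketch (hypothetical helper, illustrating the MGONGPU_HAS_NO_BLAS guard)
#include "mgOnGpuConfig.h"  // MGONGPU_HAS_NO_BLAS (always defined in the C++-only backend)
#include "GpuAbstraction.h" // gpuBlasHandle_t, gpuBlasCreate, gpuBlasDaxpy, ... (GPU builds)
#include "GpuRuntime.h"     // checkGpuBlas (GPU builds with BLAS)
inline void deviceAxpySketch( int n, double a, const double* d_x, double* d_y )
{
#ifndef MGONGPU_HAS_NO_BLAS
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasDaxpy( handle, n, &a, d_x, 1, d_y, 1 ) ); // y := a*x + y
  checkGpuBlas( gpuBlasDestroy( handle ) );
#else
  ( void )n; ( void )a; ( void )d_x; ( void )d_y; // no BLAS available in this build
#endif
}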
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 78cdfd68b2..6e14919193 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006547212600708008  +DEBUG: model prefixing takes 0.008086442947387695  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,21 +150,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.171 s +1 processes with 123 diagrams generated in 0.250 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -177,25 +176,25 @@ FileWriter t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 
17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s -Wrote files for 222 helas calls in 0.660 s +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.835 s +Wrote files for 222 helas calls in 1.049 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.328 s +ALOHA: aloha creates 5 routines in 0.627 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.313 s +ALOHA: aloha creates 10 routines in 0.561 s VVV1 VVV1 FFV1 @@ -208,38 +207,32 
@@ ALOHA: aloha creates 10 routines in 0.313 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses/P1_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 275 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. 
quit -real 0m4.934s -user 0m3.516s -sys 0m0.277s -Code generation completed in 5 seconds +real 0m6.937s +user 0m6.044s +sys 0m0.744s +Code generation completed in 7 seconds ************************************************************ * * * W E L C O M E to * @@ -252,7 +245,7 @@ Code generation completed in 5 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -260,10 +253,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -282,7 +274,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -290,10 +282,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). 
All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index 1fa5e235b3..f27fba5d1a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat index ecdc7fd25c..964b954d74 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat index 7ec841d6c2..308f5bed4f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc b/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/makefile b/epochX/cudacpp/gg_ttgg.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttgg.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc b/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
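          // [Editor's illustration, not part of the patch] Worked example of the AOS<->AOSOA
          // index mapping computed in the loop below, assuming for illustration
          // npar = 4, np4 = 4, neppM = 4 and event ievt = 5 (so ipagM = 1, ieppM = 1):
          //   Fortran-style AOS:  fpos = ievt*npar*np4 + ipar*np4 + ip4 = 80 + 4*ipar + ip4
          //   cudacpp AOSOA:      cpos = ipagM*npar*np4*neppM + ipar*np4*neppM + ip4*neppM + ieppM
          //                            = 64 + 16*ipar + 4*ip4 + 1
          // i.e. the same momentum component of neppM consecutive events ends up contiguous in
          // memory, which is what the SIMD/GPU kernels read.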
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
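The checkGpuBlas/assertGpuBlas helpers added to GpuRuntime.h above mirror the existing checkGpu/assertGpu pattern, but for cuBLAS/hipBLAS status codes instead of runtime error codes. A minimal usage sketch follows (hypothetical helper name, assuming a GPU build where MGONGPU_HAS_NO_BLAS is not defined); note that the new gpuStreamCreate/gpuStreamDestroy macros already expand to checkGpu-wrapped calls, so only the gpuBlas* calls need the explicit status check.

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

void exampleBlasSetup() // hypothetical helper, not part of the plugin
{
  gpuStream_t stream;
  gpuStreamCreate( &stream ); // already expands to checkGpu( cudaStreamCreate/hipStreamCreate( ... ) )
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // asserts unless the status is GPUBLAS_STATUS_SUCCESS
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // attach the BLAS handle to the stream
  // ... enqueue gpuBlasTgemm / gpuBlasTgemmStridedBatched work on 'stream' here ...
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream ); // also wrapped in checkGpu by the macro
}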
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
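The per-helicity "super-buffers" allocated in computeGoodHelicities above (m_pHelMEs, m_pHelJamps, m_pHelNumerators, m_pHelDenominators) pack one nevt-sized slice per good helicity, so element (ighel, ievt) lives at offset ighel * nevt + ievt; this is the indexing used later by select_hel when it sums the per-helicity MEs. A minimal sketch of that layout is shown below (hypothetical kernel name, assuming fptype comes from the plugin's mgOnGpuConfig.h).

#include "mgOnGpuConfig.h"

__global__ void sumOverGoodHelicities( const fptype* ghelAllMEs, // input: super-buffer [nGoodHel * nevt]
                                       fptype* allMEs,           // output: summed MEs [nevt]
                                       const int nGoodHel,
                                       const int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
  if( ievt >= nevt ) return;
  fptype me = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    me += ghelAllMEs[ighel * nevt + ievt]; // slice ighel, event ievt
  allMEs[ievt] = me;
}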
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index c508e73f26..4272326385 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 24; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
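The DeviceAccessJamp2 helper introduced above stores the per-color jamp2 sums with the event index fastest, i.e. buffer[icol * nevt + ievt], so for a fixed color the threads of a warp (one event per thread) touch contiguous addresses. The illustrative kernel below uses the same layout (hypothetical name, and separate real/imaginary input arrays are an assumption of the sketch, not the plugin's DeviceAccessJamp layout); atomicAdd is used because, as in calculate_jamps, several helicity streams may update the same buffer concurrently.

#include "mgOnGpuConfig.h"

__global__ void accumulateJamp2( fptype* allJamp2s,    // in/out: [ncolor * nevt], running sum over colors/helicities
                                 const fptype* jampRe, // input: [ncolor * nevt] real parts for one helicity (sketch layout)
                                 const fptype* jampIm, // input: [ncolor * nevt] imaginary parts for one helicity (sketch layout)
                                 const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;                // as in DeviceAccessJamp2::kernelAccessIcol
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const fptype re = jampRe[icol * nevt + ievt];
    const fptype im = jampIm[icol * nevt + ievt];
    atomicAdd( &allJamp2s[icol * nevt + ievt], re * re + im * im ); // |jamp|^2 added to the per-color running sum
  }
}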
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -2461,176 +2517,43 @@ namespace mg5amcCpu jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxgg()?) 
- - // The color denominators (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] - - // The color matrix (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, - { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, - { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, - { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, - { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, - { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, - { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, - { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, - { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, - { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, - { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, - { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, - { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, - { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, - { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, - { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, - { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, - { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, - { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, - { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, - { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, - { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, - { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, - { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - 
value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -2718,7 +2641,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } 
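The block removed above folded the color matrix sum into calculate_wavefunctions; in the new scheme calculate_jamps only stores the per-helicity jamps, and the quadratic form with the real, symmetric color matrix is evaluated afterwards in a separate color_sum step, optionally through cuBLAS/hipBLAS when CUDACPP_RUNTIME_BLASCOLORSUM is set. For reference, a host-side sketch of that quadratic form, written exactly as the removed CUDA branch computed it (illustrative standalone function, not the plugin's color_sum implementation):

#include <complex>
#include <vector>

double colorSumReference( const std::vector<std::complex<double>>& jamp, // [ncolor] partial amplitudes for one event and helicity
                          const std::vector<std::vector<double>>& cf,    // [ncolor][ncolor] color matrix
                          const std::vector<double>& denom )             // [ncolor] color denominators
{
  const size_t ncolor = jamp.size();
  double me2 = 0;
  for( size_t icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( size_t jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real(); // cf is real: the quadratic form splits into Re and Im parts
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    me2 += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  return me2;
}

Stacking Re(J) and Im(J) for all events of a given helicity as columns turns the inner loops into two real matrix products, which is the kind of operation the new gpuBlasTgemm / gpuBlasTgemmStridedBatched aliases are there to map onto.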
//-------------------------------------------------------------------------- @@ -2753,6 +2680,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -2795,6 +2726,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -2915,8 +2850,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -2924,25 +2859,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr 
to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -3087,13 +3200,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -3105,18 +3212,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -3141,93 +3253,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3269,7 +3318,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -3292,7 +3341,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -3301,21 +3350,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -3329,8 +3380,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -3346,11 +3399,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -3452,14 +3506,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index 2b75e0f842..05c6aedfb3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 123; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 24; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f index c087f3f747..347686d1e9 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f index ce5493be9b..7e58e4577f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
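As an aside on the selection logic introduced above: the new GPU kernels add_and_select_hel and select_col, and the corresponding C++ loops in sigmaKin, all perform the same event-by-event draw — build a running (cumulative) sum of per-candidate weights (the per-helicity MEs in ghelAllMEs, or the per-color targetamp built from jamp2), then pick the first candidate whose cumulative fraction exceeds the uniform random number allrndhel[ievt] or allrndcol[ievt]. The following is a minimal standalone sketch of that draw only, not code taken from the patch; the helper name cumulativeDraw and the sample weights are invented for illustration.

// Illustrative sketch of the cumulative-sum draw used for helicity (#403) and color (#402) selection
#include <cassert>
#include <cstdio>
#include <vector>

// Return a 1-based index (Fortran convention, as for allselhel/allselcol) drawn with
// probability proportional to 'weights', given a uniform random number 'rnd' in [0,1)
static int cumulativeDraw( const std::vector<double>& weights, double rnd )
{
  double sum = 0;
  for( double w : weights ) sum += w;
  assert( sum > 0 );
  double running = 0;
  for( size_t i = 0; i < weights.size(); i++ )
  {
    running += weights[i]; // running sum, as in ghelAllMEs / targetamp above
    if( rnd < running / sum ) return (int)i + 1; // NB Fortran [1,n], cudacpp [0,n-1]
  }
  return (int)weights.size(); // safety net for rnd close to 1 and rounding effects
}

int main()
{
  const std::vector<double> mesPerGoodHel = { 0.1, 0.4, 0.5 }; // e.g. |M|^2 per good helicity (made-up values)
  printf( "selected helicity (1-based): %d\n", cumulativeDraw( mesPerGoodHel, 0.45 ) ); // prints 2
  return 0;
}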
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc new file mode 100644 index 0000000000..91a7f9998e --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc @@ -0,0 +1,449 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] + + // The color matrix (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, + { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, + { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, + { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, + { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, + { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, + { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, + { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, + { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, + { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, + { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, + { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, + { 8, -1, -64, 8, -10, -1, 80, 
-10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, + { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, + { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, + { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, + { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, + { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, + { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, + { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, + { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, + { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, + { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, + { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the 
property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps 
already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/configs.inc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/configs.inc index b50d3d5335..570419b5c0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/configs.inc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/configs.inc @@ 
-1530,3 +1530,5 @@ C Diagram 105 DATA (SPROP(I,-4,105),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/105/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fbridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/makefile_original.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f index 3ea53d8b21..5a966d34d4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -275,17 +272,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -355,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -398,7 +384,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(155) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -441,407 +428,81 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 1),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 1),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 1),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ + DATA DENOM/54/ + DATA (CF(I),I= 1, 24) /512,-128,-128,16,16,160,-128,16,16,-2,-2 + $ ,-20,16,-2,160,-20,142,124,-2,-20,-20,124,124,-56/ C 1 T(1,2,5,6,3,4) - DATA (CF(I, 2),I= 1, 6) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 2),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 2),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 2),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ + DATA (CF(I),I= 25, 47) /512,16,160,-128,16,16,-128,-2,-20,16,-2, + $ -2,-20,-20,124,124,-56,16,-2,160,-20,142,124/ C 1 T(1,2,6,5,3,4) - DATA (CF(I, 3),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 3),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 3),I= 13, 18) /-1.185185185185185D+00 - $ 
,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 3),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I= 48, 69) /512,-128,160,16,16,-2,160,-20,142,124, + $ -128,16,16,-2,-2,-20,-20,-2,124,-56,-20,124/ C 1 T(1,5,2,6,3,4) - DATA (CF(I, 4),I= 1, 6) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 4),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 4),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 4),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ + DATA (CF(I),I= 70, 90) /512,16,-128,-2,-20,-20,124,124,-56,16, + $ -128,-2,-20,16,-2,-2,16,142,124,160,-20/ C 1 T(1,5,6,2,3,4) - DATA (CF(I, 5),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 5),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 5),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 5),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I= 91,110) /512,-128,-2,16,142,124,160,-20,-20,-2 + $ ,124,-56,-20,124,-128,16,16,-2,-2,-20/ C 1 T(1,6,2,5,3,4) - DATA (CF(I, 6),I= 1, 6) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 6),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 6),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 6),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=111,129) /512,-20,-2,124,-56,-20,124,-2,16,142,124 + $ ,160,-20,16,-128,-2,-20,16,-2/ C 1 T(1,6,5,2,3,4) - DATA (CF(I, 7),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 7),I= 7, 12) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 7),I= 13, 18) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 7),I= 19, 24) /-1.851851851851852D-01 - $ 
,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I=130,147) /512,-128,-128,16,16,160,160,-20,16,-2 + $ ,124,142,-20,124,-2,-20,-56,124/ C 1 T(2,1,5,6,3,4) - DATA (CF(I, 8),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 8),I= 7, 12) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 8),I= 13, 18) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 8),I= 19, 24) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ + DATA (CF(I),I=148,164) /512,16,160,-128,16,-20,124,-2,-20,-56 + $ ,124,160,-20,16,-2,124,142/ C 1 T(2,1,6,5,3,4) - DATA (CF(I, 9),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 9),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 9),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 9),I= 19, 24) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ + DATA (CF(I),I=165,180) /512,-128,160,16,16,-2,-128,16,-20,-2,124 + $ ,-56,-20,-2,124,-20/ C 1 T(2,5,1,6,3,4) - DATA (CF(I, 10),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 10),I= 7, 12) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 10),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 10),I= 19, 24) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ + DATA (CF(I),I=181,195) /512,16,-128,-2,-20,16,-128,-2,16,142,124 + $ ,-2,16,-20,160/ C 1 T(2,5,6,1,3,4) - DATA (CF(I, 11),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 11),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 11),I= 13, 18) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 11),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=196,209) /512,-128,124,-56,-20,-2,124,-20,16,-2, + $ -128,16,-20,-2/ C 1 T(2,6,1,5,3,4) - DATA 
(CF(I, 12),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 12),I= 7, 12) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 12),I= 13, 18) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 12),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=210,222) /512,142,124,-2,16,-20,160,-2,-20,16,-128 + $ ,-2,16/ C 1 T(2,6,5,1,3,4) - DATA (CF(I, 13),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 13),I= 7, 12) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 13),I= 13, 18) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 13),I= 19, 24) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I=223,234) /512,-128,-128,16,16,160,124,-20,-56,124, + $ -2,-20/ C 1 T(5,1,2,6,3,4) - DATA (CF(I, 14),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 14),I= 7, 12) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 14),I= 13, 18) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 14),I= 19, 24) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ + DATA (CF(I),I=235,245) /512,16,160,-128,16,-20,160,124,142,16,-2/ C 1 T(5,1,6,2,3,4) - DATA (CF(I, 15),I= 1, 6) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 15),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 15),I= 13, 18) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 15),I= 19, 24) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=246,255) /512,-128,160,16,-56,124,124,-20,-20,-2/ C 1 T(5,2,1,6,3,4) - DATA (CF(I, 16),I= 1, 6) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 16),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ 
-1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 16),I= 13, 18) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 16),I= 19, 24) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=256,264) /512,16,-128,124,142,-20,160,-2,16/ C 1 T(5,2,6,1,3,4) - DATA (CF(I, 17),I= 1, 6) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 17),I= 7, 12) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 17),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 17),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=265,272) /512,-128,-2,16,-20,-2,-128,16/ C 1 T(5,6,1,2,3,4) - DATA (CF(I, 18),I= 1, 6) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 18),I= 7, 12) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 18),I= 13, 18) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 18),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=273,279) /512,-20,-2,-2,16,16,-128/ C 1 T(5,6,2,1,3,4) - DATA (CF(I, 19),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 19),I= 7, 12) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 19),I= 13, 18) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 19),I= 19, 24) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ + DATA (CF(I),I=280,285) /512,-128,-128,16,16,160/ C 1 T(6,1,2,5,3,4) - DATA (CF(I, 20),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 20),I= 7, 12) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 20),I= 13, 18) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 20),I= 19, 24) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ 
+00,-1.185185185185185D+00,1.481481481481481D-01/ + DATA (CF(I),I=286,290) /512,16,160,-128,16/ C 1 T(6,1,5,2,3,4) - DATA (CF(I, 21),I= 1, 6) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 21),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 21),I= 13, 18) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 21),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ + DATA (CF(I),I=291,294) /512,-128,160,16/ C 1 T(6,2,1,5,3,4) - DATA (CF(I, 22),I= 1, 6) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 22),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 22),I= 13, 18) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 22),I= 19, 24) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=295,297) /512,16,-128/ C 1 T(6,2,5,1,3,4) - DATA (CF(I, 23),I= 1, 6) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 7, 12) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 23),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ + DATA (CF(I),I=298,299) /512,-128/ C 1 T(6,5,1,2,3,4) - DATA (CF(I, 24),I= 1, 6) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 24),I= 7, 12) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 24),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 24),I= 19, 24) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ + DATA (CF(I),I=300,300) /512/ C 1 T(6,5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
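The DATA statements above replace the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix with a packed integer upper triangle CF(NCOLOR*(NCOLOR+1)/2) (300 entries for the 24 color flows of this process) plus a common denominator DATA DENOM/54/: each packed value is the old real entry scaled by DENOM on the diagonal and by 2*DENOM off the diagonal (for example 9.481481...*54 = 512 and -1.185185...*2*54 = -128), so the color sum further down in this diff only runs over J >= I and divides once by DENOM at the end. A minimal C++ sketch of the same packed accumulation is shown below; the function and variable names are illustrative only and are not taken from this patch.

#include <complex>
#include <vector>
// me2 = ( sum_i Re[ conj(jamp_i) * sum_{j>=i} cf_packed(i,j) * jamp_j ] ) / denom
double colorSumPacked( const std::vector<std::complex<double>>& jamp, // one color amplitude per color flow (one helicity)
                       const std::vector<int>& cf,                    // packed upper triangle, size n*(n+1)/2
                       const int denom )
{
  const int ncolor = (int)jamp.size();
  double me2 = 0;
  int idx = 0; // walks the packed upper triangle row by row
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ )
      ztemp += (double)cf[idx++] * jamp[j]; // off-diagonal entries already carry the factor 2
    me2 += std::real( ztemp * std::conj( jamp[i] ) );
  }
  return me2 / denom;
}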
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -1547,10 +1208,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -1559,6 +1222,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(2)=AMP2(2)+AMP(4)*DCONJG(AMP(4)) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * 
blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
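The HASBLAS block above decides at build time whether cuBLAS/hipBLAS support is compiled in (for example "make HASBLAS=hasNoBlas" adds -DMGONGPU_HAS_NO_BLAS and leaves BLASLIBFLAGS empty), while color_sum_gpu earlier in this diff falls back to the kernel path at run time whenever no BLAS handle is passed (its comments tie that case to CUDACPP_RUNTIME_BLASCOLORSUM not being set). A rough C++ sketch of this two-level gating follows; the helper name and the "any non-empty value enables it" convention are assumptions for illustration, not code from this patch.

#include <cstdlib>
// Build-time switch: -DMGONGPU_HAS_NO_BLAS compiles the BLAS color sum out entirely.
// Run-time switch: an environment variable decides whether a BLAS handle is created at all.
inline bool useBlasColorSum()
{
#ifdef MGONGPU_HAS_NO_BLAS
  return false; // hasNoBlas build: only the kernel-based color sum is available
#else
  const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  return env != nullptr && env[0] != '\0'; // assumption: any non-empty value requests the BLAS path
#endif
}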
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
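# As an illustration of the two object groups formed below (assuming, hypothetically, that this
# P1 directory contains auto_dsig.f and auto_dsig1.f), the pattern rules expand to
#   DSIG         = driver.o auto_dsig1.o
#   DSIG_cudacpp = driver_cudacpp.o auto_dsig1_cudacpp.o
# i.e. auto_dsig.f itself is filtered out (it is compiled separately as auto_dsig.o and linked
# explicitly in the rules below), while the numbered auto_dsig*.f files are compiled once for the
# plain Fortran build and once with -DMG5AMC_MEEXPORTER_CUDACPP for the cudacpp-bridged build.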
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk new file 
mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) 
+ logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
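
The gen_ximprove_gridpack changes earlier in this hunk split every channel that needs more than max_request_event events into several jobs: nb_split is a ceiling division of the needed events, forced back to 1 when split_channels is off, clamped to max_splitting, and each split job gets a directory suffix made of a cycled letter plus a counter. The standalone sketch below only illustrates that arithmetic; compute_nb_split and split_dir_names are hypothetical helpers, and the assumption that self.alphabet is the lowercase alphabet is mine, not taken from the diff.

    import string

    def compute_nb_split(needed_event, max_request_event, max_splitting, split_channels=True):
        # Ceiling division of the requested events, clamped as in the hunk above.
        nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
        if not split_channels:
            nb_split = 1
        return max(1, min(nb_split, max_splitting))

    def split_dir_names(directory, nb_split):
        # One subdirectory name per split: a cycled letter plus a counter.
        alphabet = string.ascii_lowercase
        return [directory + alphabet[i % 26] + str((i + 1) // 26) for i in range(nb_split)]

    nb = compute_nb_split(needed_event=12000, max_request_event=2500, max_splitting=100)
    print(nb, split_dir_names('G1', nb))

With 12000 requested events and a 2500-event cap this yields 5 split jobs named G1a0 ... G1e0, matching the suffix formula used in the diff.
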
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
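
The hunks above make the SysCalc and matching-veto settings of the Pythia8 card conditional on whether the MG5aMC_PY8_interface driver is used, since only that driver performs the merging veto and reweighting itself. The sketch below is a reduced stand-in for that gating logic: the card is a plain dictionary rather than the real PY8Card object, and apply_merging_settings is a hypothetical helper written only to show the control flow.

    def apply_merging_settings(card, run_card, use_mg5amc_py8_interface):
        # Stand-in for the conditional card setup above: the SysCalc and
        # matching-veto settings are only applied when the MG5aMC_PY8_interface
        # driver steers Pythia8, and are skipped otherwise.
        if not use_mg5amc_py8_interface:
            return card
        if card.get('SysCalc:qWeed', -1.0) == -1.0:
            card['SysCalc:qWeed'] = run_card['xqcut']
        if run_card['use_syst']:
            # The driver applies the merging veto itself, so Pythia8 must not.
            card['JetMatching:doVeto'] = False
        return card

    run_card = {'xqcut': 20.0, 'use_syst': True}
    print(apply_merging_settings({'SysCalc:qWeed': -1.0}, run_card, True))
    print(apply_merging_settings({'SysCalc:qWeed': -1.0}, run_card, False))
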
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
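
The do_pythia8 changes above look for Pythia8's bundled main164 example in two locations under pythia8_path and, if neither exists, fall back to the old MG5aMC_PY8_interface via the '--old_interface' path. A minimal sketch of that lookup is below; find_pythia8_driver is a hypothetical helper and the example install prefix is invented, but the two candidate paths are the ones quoted in the diff.

    import os

    def find_pythia8_driver(pythia8_path):
        # Return the path of Pythia8's bundled main164 example if present,
        # or None to signal a fallback to the old MG5aMC_PY8_interface.
        candidates = [
            os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
            os.path.join(pythia8_path, 'examples', 'main164'),
        ]
        for exe in candidates:
            if os.path.exists(exe):
                return exe
        return None

    driver = find_pythia8_driver('/opt/pythia8')  # hypothetical install prefix
    print(driver or "main164 not found: falling back to '--old_interface'")
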
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
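
Each split job above receives its own PY8 card: Main:numberOfEvents is set to the split size and HEPMCoutput:scaling is multiplied by that size, so the showered weights stay correctly normalised after the LHE file is partitioned. The snippet below reproduces just that arithmetic with a plain dictionary standing in for the card object; cards_for_splits is a hypothetical helper, but the scaling rule follows the diff.

    def cards_for_splits(base_card, partition_for_PY8):
        # One card (as a plain dict) per split: the event count matches the
        # split size and the HEPMC weight scaling is multiplied by that size.
        cards = []
        for nevents in partition_for_PY8:
            card = dict(base_card)
            card['Main:numberOfEvents'] = nevents
            card['HEPMCoutput:scaling'] = base_card['HEPMCoutput:scaling'] * float(nevents)
            cards.append(card)
        return cards

    base = {'Main:numberOfEvents': 0, 'HEPMCoutput:scaling': 1.0e-9}
    for i, card in enumerate(cards_for_splits(base, [2500, 2500, 1200])):
        print(i, card['Main:numberOfEvents'], card['HEPMCoutput:scaling'])
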
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
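
The remove_empty_events method added above drops G directories whose events.lhe is missing or essentially empty, then scans each channel log backwards to classify why: an impossible Breit-Wigner configuration, no points passing the cuts, all amplitudes returning zero, or nothing recognisable. The snippet below is a reduced, hypothetical version of that classification over a list of log lines; only the trigger strings are taken from the diff, the rest is illustrative.

    import collections

    TRIGGERS = {
        'Impossible BW configuration': 'bwconfig',
        'Loosen cuts or increase max_events': 'cuts',
        'all returned zero': 'zero',
    }

    def classify_empty_channel(log_lines):
        # Scan the channel log from the end and return why events.lhe is empty
        # ('unknown' if none of the known messages is found).
        for line in reversed(log_lines):
            for trigger, reason in TRIGGERS.items():
                if trigger in line:
                    return reason
        return 'unknown'

    reasons = collections.defaultdict(list)
    logs = {
        'P1/G1': ['... Impossible BW configuration ...'],
        'P1/G2': ['... Loosen cuts or increase max_events ...'],
        'P1/G3': ['nothing recognisable here'],
    }
    for gdir, lines in logs.items():
        reasons[classify_empty_channel(lines)].append(gdir)
    print(dict(reasons))
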
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data b/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/madevent b/epochX/cudacpp/gg_ttgg.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/madevent +++ b/epochX/cudacpp/gg_ttgg.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index 53dd560ed6..da11e740d9 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 7e5a3007eb..04760e59cb 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0059185028076171875  +DEBUG: model prefixing takes 0.005910158157348633  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,33 +150,33 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.156 s +1 processes with 123 diagrams generated in 0.162 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.422 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. +Generated helas calls for 1 subprocesses (123 diagrams) in 0.359 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.316 s +ALOHA: aloha creates 5 routines in 0.261 s VVV1 VVV1 FFV1 @@ -190,17 +189,17 @@ ALOHA: aloha creates 5 routines in 0.316 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.455s -user 0m1.362s -sys 0m0.060s -Code generation completed in 1 seconds +real 0m1.337s +user 0m1.238s +sys 0m0.074s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. 
- * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
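// The Bridge constructor hunk above sizes the GPU grid by starting from a
// default of 256 threads per block and halving it (never below
// s_gputhreadsmin=32) until nevt == gpublocks*gputhreads. A minimal standalone
// sketch of that logic (the free function chooseGpuGrid_sketch is illustrative,
// not part of the patch):
#include <stdexcept>
#include <string>
inline void chooseGpuGrid_sketch( const int nevt, int& gpublocks, int& gputhreads, const int gputhreadsmin = 32 )
{
  gputhreads = 256;              // default number of gpu threads
  gpublocks = nevt / gputhreads; // initial guess (integer division), fixed up by the loop below
  if( ( nevt < gputhreadsmin ) || ( nevt % gputhreadsmin != 0 ) )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2;
    if( gputhreads < gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" ); // this should never happen
    gpublocks = nevt / gputhreads;
  }
}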
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
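// The transposition code in this file maps Fortran AOS momenta
// P_MULTI[nevt][npar][np4] onto the C++ AOSOA layout
// momenta[npagM][npar][np4][neppM] with nevt = npagM*neppM. A minimal
// standalone sketch of that index mapping (the function and parameter names
// are illustrative; the cpos/fpos formulas are those used in these hunks):
#include <cassert>
template<typename T>
void transposeMomentaF2C_sketch( T* out, const T* in, const int nevt, const int npar, const int np4, const int neppM )
{
  assert( nevt % neppM == 0 ); // number of events must be a multiple of neppM
  const int npagM = nevt / neppM;
  for( int ipagM = 0; ipagM < npagM; ipagM++ )
    for( int ip4 = 0; ip4 < np4; ip4++ )
      for( int ipar = 0; ipar < npar; ipar++ )
        for( int ieppM = 0; ieppM < neppM; ieppM++ )
        {
          const int ievt = ipagM * neppM + ieppM;                                                  // event index
          const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;  // AOSOA position
          const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                   // AOS position
          out[cpos] = in[fpos];                                                                    // F2C (Fortran to C)
        }
}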
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
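// The GpuRuntime.h hunk above adds a checkGpuBlas/assertGpuBlas helper that
// mirrors the existing checkGpu/assertGpu pattern: any call returning a
// gpuBlasStatus_t is asserted to be GPUBLAS_STATUS_SUCCESS, printing file and
// line otherwise. A minimal usage sketch (the function name below is
// illustrative; the macros are those defined in GpuAbstraction.h/GpuRuntime.h):
#ifdef MGONGPUCPP_GPUIMPL
#ifndef MGONGPU_HAS_NO_BLAS
inline void createAndDestroyBlasHandle_sketch()
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );  // aborts with file/line info on a non-success status
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif
#endif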
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
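// The MatrixElementKernels.cc hunks above make BLAS color sums a runtime
// choice: they are enabled only if the CUDACPP_RUNTIME_BLASCOLORSUM environment
// variable is set and non-empty, and only in builds where BLAS was not disabled
// (MGONGPU_HAS_NO_BLAS). A minimal standalone sketch of that decision logic
// (the helper name useBlasColorSum_sketch is illustrative, not from the patch):
#include <cstdlib>
#include <stdexcept>
#include <string>
inline bool useBlasColorSum_sketch()
{
  const char* blasEnv = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  const bool requested = ( blasEnv && std::string( blasEnv ) != "" );
#ifndef MGONGPU_HAS_NO_BLAS
  return requested; // enable BLAS color sums only on explicit request
#else
  if( requested )
    throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" );
  return false;
#endif
}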
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 5956559974..d50b7efcec 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 24; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
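//--------------------------------------------------------------------------
// Illustration (not part of the generated code): a minimal standalone sketch of the color-major,
// event-minor layout assumed by the DeviceAccessJamp2 accessor above, i.e. element (icol, ievt)
// of a jamp2 buffer lives at buffer[icol * nevt + ievt]. All names below are hypothetical.
#include <cassert>
#include <cstddef>
#include <vector>

static inline std::size_t jamp2Index( int icol, int ievt, int nevt )
{
  return static_cast<std::size_t>( icol ) * nevt + ievt; // one contiguous slice of nevt values per color
}

int main()
{
  const int ncolor = 24, nevt = 8;
  std::vector<double> jamp2( ncolor * nevt, 0. );
  jamp2[jamp2Index( 2, 5, nevt )] += 1.; // accumulate |jamp|^2 for color 2 of event 5
  assert( jamp2[2 * nevt + 5] == 1. );   // same element, indexed by hand
  return 0;
}
//--------------------------------------------------------------------------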
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -2518,176 +2574,43 @@ namespace mg5amcCpu jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxgg()?) 
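//--------------------------------------------------------------------------
// Illustration (not part of the generated code): what the "leading color flows" bookkeeping above
// boils down to, as a minimal host-side sketch with hypothetical names: for each color flow icol,
// a running sum over helicities of |jamp[icol]|^2 is kept (cxabs2 in the plugin, std::norm here)
// and is later used for the event-by-event choice of color.
#include <complex>
#include <vector>

int main()
{
  const int ncolor = 24, nGoodHel = 3;
  std::vector<double> jamp2( ncolor, 0. ); // running sum over helicities of |jamp[icol]|^2
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    // Stand-in partial amplitudes for this helicity (the real code computes them from Feynman diagrams)
    std::vector<std::complex<double>> jamp( ncolor, std::complex<double>( 0.1 * ighel, 0.2 ) );
    for( int icol = 0; icol < ncolor; icol++ )
      jamp2[icol] += std::norm( jamp[icol] ); // |z|^2 = re^2 + im^2; may underflow in float (see #831)
  }
  return 0;
}
//--------------------------------------------------------------------------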
- - // The color denominators (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] - - // The color matrix (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, - { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, - { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, - { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, - { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, - { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, - { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, - { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, - { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, - { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, - { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, - { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, - { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, - { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, - { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, - { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, - { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, - { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, - { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, - { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, - { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, - { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, - { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, - { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - 
value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -2775,7 +2698,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } 
//-------------------------------------------------------------------------- @@ -2810,6 +2737,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -2852,6 +2783,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -2972,8 +2907,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -2981,25 +2916,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr 
to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -3144,13 +3257,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -3162,18 +3269,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -3198,93 +3310,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3326,7 +3375,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -3349,7 +3398,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -3358,21 +3407,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -3386,8 +3437,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -3403,11 +3456,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -3509,14 +3563,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index 2b75e0f842..05c6aedfb3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 123; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 24; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc new file mode 100644 index 0000000000..91a7f9998e --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc @@ -0,0 +1,449 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
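//--------------------------------------------------------------------------
// Illustration (not part of the generated code): rough host-side sizing (in fptype elements) of the
// per-helicity "ghel" super-buffers taken by the new GPU sigmaKin signature above, matching the
// gpuMemset calls in sigmaKin; the struct and helper names are hypothetical, and ghelAllBlasTmp is
// omitted because its size is implementation-dependent.
#include <cstddef>

struct GhelBufferSizes
{
  std::size_t mes;          // ghelAllMEs: one ME per good helicity per event
  std::size_t jamps;        // ghelAllJamps: 2 reals (one complex jamp) per color per good helicity per event
  std::size_t numerators;   // ghelAllNumerators (multichannel only): one per good helicity per event
  std::size_t denominators; // ghelAllDenominators (multichannel only): one per good helicity per event
  std::size_t jamp2s;       // colAllJamp2s: one per color per event (summed over helicities)
};

inline GhelBufferSizes ghelBufferSizes( std::size_t nGoodHel, std::size_t ncolor, std::size_t nevt )
{
  return GhelBufferSizes{ nGoodHel * nevt,
                          2 * ncolor * nGoodHel * nevt,
                          nGoodHel * nevt,
                          nGoodHel * nevt,
                          ncolor * nevt };
}
//--------------------------------------------------------------------------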
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] + + // The color matrix (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, + { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, + { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, + { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, + { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, + { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, + { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, + { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, + { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, + { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, + { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, + { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, + { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, + { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, + { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, + { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, + { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, + { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, + { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, + { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, + { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, + { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, + { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, + { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + 
template<typename T>
+  struct NormalizedColorMatrix
+  {
+    constexpr __host__ __device__ NormalizedColorMatrix()
+      : value()
+    {
+      for( int icol = 0; icol < ncolor; icol++ )
+        for( int jcol = 0; jcol < ncolor; jcol++ )
+          value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol];
+    }
+    T value[ncolor * ncolor];
+  };
+  // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+  static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix()
+  {
+    static bool first = true;
+    if( first )
+    {
+      first = false;
+      constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2;
+      gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs,              // output: allMEs[nevt], add |M|^2 for one specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 )            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+  {
+    // Pre-compute a constexpr triangular color matrix properly normalized #475
+    struct TriangularNormalizedColorMatrix
+    {
+      // See https://stackoverflow.com/a/34465458
+      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+        : value()
+      {
+        for( int icol = 0; icol < ncolor; icol++ )
+        {
+          // Diagonal terms
+          value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+          // Off-diagonal terms
+          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+            value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+        }
+      }
+      fptype2 value[ncolor][ncolor];
+    };
+    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the imaginary cross terms cancel since |M|^2 is real).
+    // In addition, on C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain in speed here (though not a full factor 2) as we only loop over the upper-diagonal part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
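+    // Illustration of the triangular trick above for a symmetric 2x2 matrix M (same contribution as the full double loop):
+    //   full:       ME += A0*( M00*A0 + M01*A1 ) + A1*( M10*A0 + M11*A1 )
+    //   triangular: ME += A0*( M00*A0 + 2*M01*A1 ) + A1*( M11*A1 )   since M01 == M10
+    // (and similarly for the imaginary parts B, whose contribution BMB is added on top of AMA)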
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] =
+          allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* ghelAllMEs,           // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity
+                  const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities
+                  fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nhel good helicities
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null)
+#else
+                  gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null)
+#endif
+                  const int nhel,               // input: number of good helicities (nhel == nGoodHel)
+                  const int gpublocks,          // input: cuda gpublocks
+                  const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffer for the nhel helicities
+    fptype2* ghelAllZtempBoth = ghelAllBlasTmp;                                         // start of first fptype2[ncolor*2*nhel*nevt] buffer
+    fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt;   // start of second fptype2[ncolor*2*nhel*nevt] buffer
+    fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer
+    // Convert jamps from double to float
+    for( int ighel = 0; ighel < nhel; ighel++ )
+    {
+      const fptype* hAllJamps = ghelAllJamps + ighel * nevt;    // jamps for a single helicity ihel
+      fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel
+      gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel );
+    }
+    // Real and imaginary components
+    const fptype2* ghelAllJampsReal = ghelAllJampsFpt2;
+    const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt;
+#else
+    // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer
+    static_assert( std::is_same<fptype, fptype2>::value );
+    fptype2* ghelAllZtempBoth = ghelAllBlasTmp; //
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fbridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/makefile_original.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + 
nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
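+# (For reference: PROCESS collects the common Fortran objects, while DSIG and DSIG_cudacpp collect driver.o or
+# driver_cudacpp.o plus the auto_dsig*.f objects except auto_dsig.f; the *_cudacpp.o objects are built with
+# -DMG5AMC_MEEXPORTER_CUDACPP via the %_cudacpp.o pattern rule further below.)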
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index 53dd560ed6..da11e740d9 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17
// By the MadGraph5_aMC@NLO Development Team
// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
//==========================================================================
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h
index 76066c7bb1..24e0e80f84 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h
@@ -7,7 +7,7 @@
// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
//==========================================================================
// This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17
// By the MadGraph5_aMC@NLO Development Team
// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
//==========================================================================
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
index d3c4ca5695..7d34de72f8 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
@@ -74,6 +74,7 @@
#define MGONGPU_FPTYPE2_DOUBLE 1 // default
//#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
#endif
+
// Choose whether to inline all HelAmps functions
// This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
// By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
#endif
+
+// Choose if cuBLAS and hipBLAS are supported for computing the color sum
+// For both CUDA and HIP, by default, assume that cuBLAS/hipBLAS are available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#elif defined __HIPCC__
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#else
+#define MGONGPU_HAS_NO_BLAS 1
+#endif
+
// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
#undef MGONGPU_NSIGHT_DEBUG // default in CUDA
-//#define MGONGPU_NSIGHT_DEBUG 1
+//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED!
#else
#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
#endif /* clang-format on */
@@ -232,19 +246,19 @@ using mgOnGpu::fptype2;
#endif /* clang-format off */
-// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 1afa1ab2a5..bdc543f19d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005990028381347656  +DEBUG: model prefixing takes 0.004824638366699219  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,27 +150,27 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.890 s +1 processes with 1240 diagrams generated in 1.735 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 6s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h @@ -179,25 +178,25 @@ FileWriter t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 
405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 
705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 
119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 
551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 
830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.565 s -Wrote files for 2281 helas calls in 18.614 s +DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 
225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 
674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 
928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 
406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 
705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 5.829 s +Wrote files for 2281 helas calls in 15.156 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.373 s +ALOHA: aloha creates 5 routines in 0.258 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.313 s +ALOHA: aloha creates 10 routines in 0.293 s VVV1 VVV1 FFV1 @@ -210,38 +209,32 @@ ALOHA: aloha creates 10 routines in 0.313 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses/P1_gg_ttxggg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 339 (offset 112 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. 
quit -real 0m33.065s -user 0m32.263s -sys 0m0.459s -Code generation completed in 33 seconds +real 0m28.853s +user 0m28.097s +sys 0m0.568s +Code generation completed in 29 seconds ************************************************************ * * * W E L C O M E to * @@ -254,7 +247,7 @@ Code generation completed in 33 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -262,10 +255,9 @@ Code generation completed in 33 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -284,7 +276,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -292,10 +284,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. 
Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index cdd9d43b05..0125eda85b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat index a08f93d92b..596243d42e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat index 48050a5fd7..377d5bc1c7 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc b/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/makefile b/epochX/cudacpp/gg_ttggg.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttggg.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc b/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
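// A standalone sketch of the block/thread selection above (chooseGpuGrid and its
// constants are illustrative only, not part of the plugin API): start from 256
// threads per block and halve until nevt == gpublocks*gputhreads, giving up below
// the 32-thread minimum, exactly as the Bridge constructor does.
//   e.g. chooseGpuGrid( 8192 ) -> { 32, 256 }, chooseGpuGrid( 512 ) -> { 2, 256 }, chooseGpuGrid( 32 ) -> { 1, 32 }
#include <stdexcept> // needed only by this standalone illustration
#include <utility>
inline std::pair<int, int> chooseGpuGrid( unsigned int nevt )
{
  constexpr int gputhreadsmin = 32; // same minimum as s_gputhreadsmin above
  int gputhreads = 256;             // same default as in the Bridge constructor
  int gpublocks = nevt / gputhreads;
  while( nevt != (unsigned int)( gpublocks * gputhreads ) )
  {
    gputhreads /= 2;
    if( gputhreads < gputhreadsmin ) throw std::logic_error( "cannot choose gputhreads" );
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads };
}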
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
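// Worked example of the AOS (Fortran) -> AOSOA (C++) index mapping used in these
// transposition loops, with hypothetical npar=4, np4=4, neppM=4 (illustration only,
// not part of the generated code). The formulas are the fpos/cpos expressions below:
//   fpos = ievt*npar*np4 + ipar*np4 + ip4
//   cpos = ipagM*npar*np4*neppM + ipar*np4*neppM + ip4*neppM + ieppM, with ievt = ipagM*neppM + ieppM
namespace // editor-style illustration with hypothetical values
{
  constexpr int exNpar = 4, exNp4 = 4, exNeppM = 4;
  constexpr int exFpos( int ievt, int ipar, int ip4 ) { return ievt * exNpar * exNp4 + ipar * exNp4 + ip4; }
  constexpr int exCpos( int ievt, int ipar, int ip4 )
  {
    return ( ievt / exNeppM ) * exNpar * exNp4 * exNeppM + ipar * exNp4 * exNeppM + ip4 * exNeppM + ( ievt % exNeppM );
  }
  static_assert( exFpos( 5, 2, 3 ) == 91, "AOS index for (ievt=5, ipar=2, ip4=3)" );
  static_assert( exCpos( 5, 2, 3 ) == 109, "AOSOA index for (ievt=5, ipar=2, ip4=3)" );
}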
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
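// A minimal standalone usage sketch of the checkGpuBlas/assertGpuBlas pattern added
// in GpuRuntime.h above (CUDA/cuBLAS spelling only; on HIP the same calls map to
// hipBLAS via GpuAbstraction.h). This is an illustration, not plugin code.
#include "cublas_v2.h"
#include <cassert>
#include <cstdio>
#define checkBlas( code ) { assertBlas( code, __FILE__, __LINE__ ); }
inline void assertBlas( cublasStatus_t code, const char* file, int line, bool abort = true )
{
  if( code != CUBLAS_STATUS_SUCCESS )
  {
    printf( "ERROR! assertBlas: '%d' in %s:%d\n", code, file, line );
    if( abort ) assert( code == CUBLAS_STATUS_SUCCESS );
  }
}
int main()
{
  cublasHandle_t handle;
  checkBlas( cublasCreate( &handle ) ); // any non-success status prints and asserts
  checkBlas( cublasDestroy( handle ) );
  return 0;
}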
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
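// A compact standalone sketch showing how the gpuBlas*/gpuStream* abstractions used
// by these kernels translate to plain CUDA/cuBLAS calls: handle creation, binding to
// a stream, the optional CUBLAS_TF32_TENSOR_OP_MATH mode enabled above for
// FPTYPE2=float builds, and one SGEMM. It is an illustration only, not the plugin's
// color_sum implementation.
#include "cublas_v2.h"
#include <cuda_runtime.h>
#include <cassert>
#include <vector>
int main()
{
  const int n = 4; // tiny n x n GEMM: C = A * B with A=1, B=2 everywhere
  std::vector<float> hA( n * n, 1.f ), hB( n * n, 2.f ), hC( n * n, 0.f );
  float *dA, *dB, *dC;
  cudaMalloc( &dA, n * n * sizeof( float ) );
  cudaMalloc( &dB, n * n * sizeof( float ) );
  cudaMalloc( &dC, n * n * sizeof( float ) );
  cudaMemcpy( dA, hA.data(), n * n * sizeof( float ), cudaMemcpyHostToDevice );
  cudaMemcpy( dB, hB.data(), n * n * sizeof( float ), cudaMemcpyHostToDevice );
  cudaStream_t stream;
  cudaStreamCreate( &stream );                             // cf. gpuStreamCreate
  cublasHandle_t handle;
  cublasCreate( &handle );                                 // cf. gpuBlasCreate
  cublasSetStream( handle, stream );                       // cf. gpuBlasSetStream
  cublasSetMathMode( handle, CUBLAS_TF32_TENSOR_OP_MATH ); // affects FP32 GEMMs only, as noted above
  const float alpha = 1.f, beta = 0.f;
  cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,           // cf. gpuBlasSgemm, GPUBLAS_OP_N
               n, n, n, &alpha, dA, n, dB, n, &beta, dC, n );
  cudaStreamSynchronize( stream );
  cudaMemcpy( hC.data(), dC, n * n * sizeof( float ), cudaMemcpyDeviceToHost );
  assert( hC[0] == 8.f );                                  // 1*2 summed over k=4
  cublasDestroy( handle );                                 // cf. gpuBlasDestroy
  cudaStreamDestroy( stream );
  cudaFree( dA );
  cudaFree( dB );
  cudaFree( dC );
  return 0;
}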
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index ba06f6ff44..0548d00f74 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 120; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
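The new per-helicity kernel writes its jamps and jamp2 sums into event-major buffers, and the DeviceAccessJamp2 helper above encodes the corresponding index, buffer[icol * nevt + ievt], so that consecutive CUDA threads (consecutive ievt) touch consecutive addresses. Below is a minimal standalone sketch of that access pattern; the kernel and buffer names are hypothetical illustrations, not code from the patch itself.

#include <cuda_runtime.h>
#include <cstdio>

// Hypothetical illustration of DeviceAccessJamp2-style event-major indexing:
// element (icol, ievt) of an [ncolor][nevt] buffer lives at buffer[icol * nevt + ievt].
__global__ void accumulateJamp2( const float* jampRe, const float* jampIm, float* jamp2, const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const float re = jampRe[icol * nevt + ievt];
    const float im = jampIm[icol * nevt + ievt];
    jamp2[icol * nevt + ievt] += re * re + im * im; // add |jamp(icol)|^2 for this event
  }
}

int main()
{
  const int ncolor = 4, gpublocks = 2, gputhreads = 32, nevt = gpublocks * gputhreads;
  const size_t bytes = ncolor * nevt * sizeof( float );
  float *dRe, *dIm, *dJamp2;
  cudaMalloc( (void**)&dRe, bytes );
  cudaMalloc( (void**)&dIm, bytes );
  cudaMalloc( (void**)&dJamp2, bytes );
  cudaMemset( dRe, 0, bytes );
  cudaMemset( dIm, 0, bytes );
  cudaMemset( dJamp2, 0, bytes );
  accumulateJamp2<<<gpublocks, gputhreads>>>( dRe, dIm, dJamp2, ncolor );
  cudaDeviceSynchronize();
  printf( "accumulated jamp2 for %d colors x %d events\n", ncolor, nevt );
  cudaFree( dRe );
  cudaFree( dIm );
  cudaFree( dJamp2 );
  return 0;
}

With this layout each colour's slice is contiguous across events, which is what keeps the per-thread loads coalesced.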
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -29966,272 +30022,43 @@ namespace mg5amcCpu jamp_sv[116] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxggg()?) 
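The in-line colour algebra removed below evaluated, per helicity, the quadratic form deltaME = sum over icol of Re( conj(jamp[icol]) * sum over jcol of cf[icol][jcol] * jamp[jcol] ) / denom[icol]; as noted in the comments above, this colour sum is now delegated to the separate color_sum.h implementation. A toy-sized host-side sketch of the same reduction follows, with hypothetical 2-colour constants in place of the 120-colour gg_ttxggg arrays.

#include <complex>
#include <cstdio>

typedef std::complex<double> cxtype;

// deltaME = sum_i Re( conj(jamp[i]) * sum_j cf[i][j] * jamp[j] ) / denom[i]
double colorSum( const cxtype* jamp, const double* cf, const double* denom, const int ncolor )
{
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    cxtype ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += cf[icol * ncolor + jcol] * jamp[jcol]; // row-major cf[icol][jcol]
    deltaME += ( std::conj( jamp[icol] ) * ztemp ).real() / denom[icol];
  }
  return deltaME;
}

int main()
{
  const int ncolor = 2;                              // toy size, not the real ncolor = 120
  const double cf[ncolor * ncolor] = { 9, 3, 3, 9 }; // toy colour matrix (not the process constants)
  const double denom[ncolor] = { 3, 3 };             // toy colour denominators
  const cxtype jamp[ncolor] = { cxtype( 1., 0.5 ), cxtype( -0.25, 2. ) };
  printf( "deltaME = %f\n", colorSum( jamp, cf, denom, ncolor ) );
  return 0;
}

Only the cf and denom constants are process-specific; the reduction itself has the same shape for every process, which is what makes it natural to factor out of the generated CPPProcess.cc.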
- - // The color denominators (initialize all array elements, with ncolor=120) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120] - - // The color matrix (initialize all array elements, with ncolor=120) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 }, - { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, - { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, - { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, - { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, 
-134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, - { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, - { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, - { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, - { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, - { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, - { -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 
442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, - { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, - { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, - { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 }, - { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, - { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, - { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, - { 496, -224, -80, -8, 
496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, - { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, - { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, - { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, - { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, - { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, - { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, 
-62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, - { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, - { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, - { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, - { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, - { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, - { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 
-80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, - { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, - { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, - { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, - { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, - { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, - { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 
1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, - { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, - { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, - { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, - { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, - { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, - { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, - { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, 
-80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, - { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, - { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, - { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, - { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, - { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, - { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, 
-512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, - { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, - { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, - { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, - { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, - { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, - { 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 
1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, - { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, - { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, - { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, - { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, - { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, - { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 
100 }, - { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, - { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, - { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, - { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, - { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, - { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, - { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 
10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, - { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, - { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, - { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, - { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, - { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, - { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, 
-44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, - { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, - { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, - { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, - { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, - { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, - { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, 
-134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, - { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, - { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, - { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, - { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, - { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, - { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, - { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, 
-53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, - { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, - { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, - { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, - { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, - { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, - { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 
100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, - { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, - { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, - { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, - { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, - { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496 }, - { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 
136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, - { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, - { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, - { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, - { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, - { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, - { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, 
-8, 496, -224, -80, -8, 496, -80 }, - { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, - { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, - { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, - { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, - { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, - { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, - { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 
-53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, - { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, - { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, - { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, - { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, - { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, - { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, 
-224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, - { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, - { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! 
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -30383,7 +30210,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } 
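The hunks above strip the in-kernel color algebra (the quadratic form over the ncolor=120 cf/denom matrix, including the constexpr TriangularNormalizedColorMatrix of #475) out of calculate_wavefunctions, now renamed calculate_jamps. As a reference for what that color sum computes, here is a minimal standalone C++ sketch (an editor's illustration, not generated code): the names colorSumFull and colorSumTriangular and the 2x2 cf/denom values are invented for the example, but the two functions follow, respectively, the removed CUDA-style full double loop and the removed C++-style upper-triangle loop, and they agree because the normalized matrix cf[i][j]/denom[i] is real and symmetric.

#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

constexpr int ncolor = 2;                                         // the real process above has ncolor = 120
constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } }; // illustrative real symmetric color factors
constexpr double denom[ncolor] = { 3, 3 };                        // illustrative per-row denominators

// Full quadratic form, as in the (removed) CUDA branch:
// |M|^2 += sum_i [ (sum_j cf[i][j]*Aj)*Ai + (sum_j cf[i][j]*Bj)*Bi ] / denom[i], with A=Re(jamp), B=Im(jamp)
double colorSumFull( const std::complex<double> jamp[ncolor] )
{
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i][j] * jamp[j].real();
      ztempI += cf[i][j] * jamp[j].imag();
    }
    me2 += ( ztempR * jamp[i].real() + ztempI * jamp[i].imag() ) / denom[i];
  }
  return me2;
}

// Upper-triangle form, as in the (removed) C++ branch: since cf[i][j]/denom[i] is real and
// symmetric, (A-iB) M (A+iB) = A M A + B M B, and the (i,j) and (j,i) off-diagonal terms fold
// into a single 2*cf[i][j]/denom[i] coefficient, which TriangularNormalizedColorMatrix
// precomputed at compile time.
double colorSumTriangular( const std::complex<double> jamp[ncolor] )
{
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  return me2;
}

int main()
{
  const std::complex<double> jamp[ncolor] = { { 1.5, -0.5 }, { -0.25, 2.0 } };
  const double me2full = colorSumFull( jamp );
  const double me2tri = colorSumTriangular( jamp );
  printf( "full=%.12f triangular=%.12f\n", me2full, me2tri );
  assert( std::fabs( me2full - me2tri ) < 1e-12 );
  return 0;
}

In the new layout this same contraction is no longer done inline per event: the hunks below hand the jamps to color_sum_gpu (per good helicity, using the gpuBlasHandle_t and per-helicity streams passed to sigmaKin) on the GPU side, and to color_sum_cpu on the SIMD C++ side.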
//-------------------------------------------------------------------------- @@ -30419,6 +30250,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -30462,6 +30297,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -30582,8 +30421,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -30591,25 +30430,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); 
nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -30754,13 +30771,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -30772,18 +30783,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -30808,93 +30824,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -30936,7 +30889,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -30959,7 +30912,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -30968,21 +30921,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -30996,8 +30951,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -31013,11 +30970,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -31119,14 +31077,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 2eb1e066ff..f20243637a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 128; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 1240; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 120; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f index 523ef1948b..e0c6371008 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f index 3152176aa0..3e9140b741 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 
0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc new file mode 100644 index 0000000000..dea7f9fdb2 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -0,0 +1,545 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120] + + // The color matrix (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 }, + { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 
64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, + { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, + { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, + { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, + { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, + { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, + { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, 
-116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, + { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, + { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, + { -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, + { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, + { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, + { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, 
-80, 10, 496, -62, 19, -53 }, + { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, + { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, + { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, + { 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, + { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, + { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, + { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, 
-62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, + { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, + { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, + { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, + { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, + { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, + { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 
496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, + { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, + { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, + { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, + { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, + { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, + { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, 
-62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, + { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, + { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, + { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, + { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, + { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, + { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, + { 10, -62, 1, 10, 28, -62, -62, 
-53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, + { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, + { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, + { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, + { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, + { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, + { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 
640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, + { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, + { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, + { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, + { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, + { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, + { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 
640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, + { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, + { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, + { 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, + { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, + { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, + { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, 
-71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, + { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, + { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, + { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100 }, + { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, + { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, + { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, + { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, 
-44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, + { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, + { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, + { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, + { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, + { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, + { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 
640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, + { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, + { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, + { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, + { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, + { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, + { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, 
-224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, + { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, + { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, + { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, + { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, + { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, + { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, + { 
-53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, + { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, + { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, + { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, + { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, + { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, + { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 
10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, + { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, + { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, + { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, + { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, + { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, + { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 
100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, + { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, + { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496 }, + { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, + { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, + { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, + { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, 
-80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, + { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, + { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, + { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80 }, + { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, + { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, + { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, + { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, 
-134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, + { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, + { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, + { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, + { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, + { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, + { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, 
-8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, + { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, + { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, + { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, + { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, + { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / 
colorDenom[icol];
+ }
+ T value[ncolor * ncolor];
+ };
+ // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+ static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ void createNormalizedColorMatrix()
+ {
+ static bool first = true;
+ if( first )
+ {
+ first = false;
+ constexpr NormalizedColorMatrix normalizedColorMatrix2;
+ gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+ }
+ }
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+ void
+ color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity
+ const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+ const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+ {
+ // Pre-compute a constexpr triangular color matrix properly normalized #475
+ struct TriangularNormalizedColorMatrix
+ {
+ // See https://stackoverflow.com/a/34465458
+ __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+ : value()
+ {
+ for( int icol = 0; icol < ncolor; icol++ )
+ {
+ // Diagonal terms
+ value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+ // Off-diagonal terms
+ for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+ value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+ }
+ }
+ fptype2 value[ncolor][ncolor];
+ };
+ static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+ // Use the property that M is a real matrix (see #475):
+ // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the imaginary cross terms cancel since M is real and symmetric)
+ // In addition, in C++ use the property that M is symmetric (see #475),
+ // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+ // we gain in speed here (though not a factor of 2...) as we only loop over the upper triangular part of the matrix.
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
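To make the triangular trick in the comment above concrete, here is a minimal standalone C++ sketch (an editor's illustration, not part of this patch): it checks, on a toy 3x3 symmetric matrix with a constant denominator and made-up complex amplitudes, that looping only over the upper triangle with off-diagonal entries pre-multiplied by 2 reproduces the full quadratic form AMA + BMB. The toy colorMatrix/colorDenom values, ncolor=3 and std::complex (in place of the plugin's SIMD types) are assumptions for the example only.

// sketch_triangular_color_sum.cpp (illustrative only)
#include <cassert>
#include <cmath>
#include <complex>
#include <iostream>

int main()
{
  constexpr int ncolor = 3;
  const double colorMatrix[ncolor][ncolor] = { { 4, -1, 1 }, { -1, 4, -1 }, { 1, -1, 4 } }; // toy symmetric matrix
  const double colorDenom[ncolor] = { 3, 3, 3 };                                            // toy constant denominator
  const std::complex<double> jamp[ncolor] = { { 1.0, 0.5 }, { -0.2, 0.8 }, { 0.3, -0.4 } }; // toy color amplitudes
  // Full quadratic form: |M|^2 = Re( sum_ij conj(J_i) * (M_ij/d_i) * J_j ) = AMA + BMB
  double me2Full = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2Full += ( std::conj( jamp[i] ) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j] ).real();
  // Triangular accumulation: diagonal counted once, off-diagonal terms folded in with a factor 2
  double me2Tri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = ( colorMatrix[i][i] / colorDenom[i] ) * jamp[i].real();
    double ztempI = ( colorMatrix[i][i] / colorDenom[i] ) * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j].real();
      ztempI += 2 * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j].imag();
    }
    me2Tri += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  assert( std::fabs( me2Full - me2Tri ) < 1e-12 ); // the two accumulations agree up to rounding
  std::cout << "full=" << me2Full << " triangular=" << me2Tri << std::endl;
  return 0;
}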
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/configs.inc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/configs.inc index cd0b177907..5d7030cc05 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/configs.inc +++ 
b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/configs.inc @@ -16695,3 +16695,5 @@ C Diagram 945 DATA (SPROP(I,-5,945),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/945/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f index 3671cdce55..c559e01778 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fbridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/makefile_original.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f index 07ccd4d1a4..72956c33dc 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -339,17 +336,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -419,7 +405,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -462,7 +448,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(3030) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -505,9375 +492,738 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 1),I= 7, 12) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 1),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 1),I= 19, 24) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 1),I= 25, 30) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 1),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 1),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 1),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 1),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 1),I= 55, 60) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 1),I= 61, 66) /1.753086419753086D+00, - $ 
-2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 1),I= 67, 72) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 1),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 1),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 1),I= 85, 90) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 1),I= 91, 96) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 1),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 1),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 1),I=109,114) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 1),I=115,120) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ + DATA DENOM/324/ + DATA (CF(I),I= 1,120) /4096,-1024,-1024,128,128,1280,-1024,128 + $ ,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160,992 + $ ,992,-448,-1024,128,128,-16,-16,-160,128,-16,-16,2,2,20,-16,2, + $ -160,20,-142,-124,2,20,20,-124,-124,56,128,-16,-16,2,2,20,1280, + $ -160,-160,20,20,200,1136,-142,992,-124,1010,1028,-142,38,-124, + $ -106,-268,-88,-16,2,-160,20,-142,-124,-160,20,992,-124,38,-106 + $ ,992,-124,-448,56,-268,-88,1010,-268,-268,884,884,-232,2,20,20, + $ -124,-124,56,20,200,-124,1028,-106,-88,-124,-106,56,-88,884, + $ -232,1028,-88,-88,-232,-232,272/ C 1 T(1,2,5,6,7,3,4) - DATA (CF(I, 2),I= 1, 6) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 2),I= 7, 12) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 2),I= 13, 18) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 2),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 2),I= 25, 30) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 2),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 2),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ 
-1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 2),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 2),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 2),I= 55, 60) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 2),I= 61, 66) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 2),I= 67, 72) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 2),I= 73, 78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 2),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 2),I= 85, 90) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 2),I= 91, 96) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 2),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 2),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 2),I=109,114) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 2),I=115,120) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ + DATA (CF(I),I=121,239) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,128,-1024,-16,-160,128,-16,-16,128,2,20,-16,2,2,20,20,-124 + $ ,-124,56,-16,2,-160,20,-142,-124,-16,128,2,20,-16,2,-160,1280 + $ ,20,200,-160,20,-142,38,-124,-106,-268,-88,1136,-142,992,-124 + $ ,1010,1028,2,20,20,-124,-124,56,20,200,-124,1028,-106,-88,-124, + $ -106,56,-88,884,-232,1028,-88,-88,-232,-232,272,-16,2,-160,20, + $ -142,-124,-160,20,992,-124,38,-106,992,-124,-448,56,-268,-88 + $ ,1010,-268,-268,884,884,-232/ C 1 T(1,2,5,7,6,3,4) - DATA (CF(I, 3),I= 1, 6) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 3),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 3),I= 13, 18) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 3),I= 19, 
24) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 3),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 3),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 3),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 3),I= 61, 66) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 3),I= 67, 72) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 3),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 3),I= 79, 84) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 3),I= 85, 90) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 3),I= 91, 96) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 3),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I=103,108) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 3),I=109,114) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 3),I=115,120) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ + DATA (CF(I),I=240,357) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,128,-16,-1024,128,-160,-16,-16,2,-160,20,-142,-124,128,-16,-16 + $ ,2,2,20,20,2,-124,56,20,-124,-16,2,-160,20,-142,-124,-160,20 + $ ,992,-124,38,-106,992,-124,-448,56,-268,-88,1010,-268,-268,884 + $ ,884,-232,128,-16,-16,2,2,20,1280,-160,-160,20,20,200,1136,-142 + $ ,992,-124,1010,1028,-142,38,-124,-106,-268,-88,20,2,-124,56,20, + $ 
-124,200,20,-106,-88,-124,1028,1028,-88,-88,-232,-232,272,-124, + $ -106,56,-88,884,-232/ C 1 T(1,2,6,5,7,3,4) - DATA (CF(I, 4),I= 1, 6) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 4),I= 7, 12) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 4),I= 13, 18) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 4),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 4),I= 25, 30) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 4),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 4),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 4),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 4),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 4),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 4),I= 61, 66) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 4),I= 67, 72) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 4),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 4),I= 79, 84) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 4),I= 85, 90) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 4),I= 91, 96) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 4),I= 97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 4),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 4),I=109,114) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 4),I=115,120) 
/1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=358,474) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,-16, + $ -160,128,-1024,-16,128,2,20,20,-124,-124,56,-16,128,2,20,-16,2 + $ ,2,-16,-142,-124,-160,20,2,20,20,-124,-124,56,20,200,-124,1028, + $ -106,-88,-124,-106,56,-88,884,-232,1028,-88,-88,-232,-232,272, + $ -16,128,2,20,-16,2,-160,1280,20,200,-160,20,-142,38,-124,-106, + $ -268,-88,1136,-142,992,-124,1010,1028,2,-16,-142,-124,-160,20 + $ ,20,-160,38,-106,992,-124,1010,-268,-268,884,884,-232,992,-124, + $ -448,56,-268,-88/ C 1 T(1,2,6,7,5,3,4) - DATA (CF(I, 5),I= 1, 6) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 5),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 5),I= 13, 18) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 5),I= 19, 24) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 5),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 5),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 5),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 5),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 5),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 5),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 5),I= 61, 66) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 5),I= 67, 72) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 5),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 5),I= 79, 84) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 5),I= 85, 90) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 5),I= 91, 96) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ 
-1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 5),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 5),I=103,108) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 5),I=109,114) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 5),I=115,120) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=475,590) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,-16,128, + $ -160,-16,-1024,128,2,-16,-142,-124,-160,20,20,2,-124,56,20,-124 + $ ,128,-16,-16,2,2,20,2,-16,-142,-124,-160,20,20,-160,38,-106,992 + $ ,-124,1010,-268,-268,884,884,-232,992,-124,-448,56,-268,-88,20 + $ ,2,-124,56,20,-124,200,20,-106,-88,-124,1028,1028,-88,-88,-232, + $ -232,272,-124,-106,56,-88,884,-232,128,-16,-16,2,2,20,1280,-160 + $ ,-160,20,20,200,1136,-142,992,-124,1010,1028,-142,38,-124,-106, + $ -268,-88/ C 1 T(1,2,7,5,6,3,4) - DATA (CF(I, 6),I= 1, 6) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 6),I= 7, 12) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 6),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 6),I= 19, 24) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 6),I= 25, 30) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 6),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 6),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 6),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 6),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 6),I= 55, 60) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 6),I= 61, 66) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 6),I= 67, 72) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 6),I= 73, 78) 
/3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 6),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 6),I= 85, 90) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 6),I= 91, 96) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 6),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 6),I=103,108) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 6),I=109,114) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 6),I=115,120) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ + DATA (CF(I),I=591,705) /4096,-160,-16,992,-448,-160,992,-16,128 + $ ,1136,992,1280,-160,128,-1024,-16,-160,128,-16,-160,-16,-16,128 + $ ,128,-1024,20,2,-124,56,20,-124,2,-16,-142,-124,-160,20,-16,128 + $ ,2,20,-16,2,20,2,-124,56,20,-124,200,20,-106,-88,-124,1028,1028 + $ ,-88,-88,-232,-232,272,-124,-106,56,-88,884,-232,2,-16,-142, + $ -124,-160,20,20,-160,38,-106,992,-124,1010,-268,-268,884,884, + $ -232,992,-124,-448,56,-268,-88,-16,128,2,20,-16,2,-160,1280,20 + $ ,200,-160,20,-142,38,-124,-106,-268,-88,1136,-142,992,-124,1010 + $ ,1028/ C 1 T(1,2,7,6,5,3,4) - DATA (CF(I, 7),I= 1, 6) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 7),I= 7, 12) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 7),I= 13, 18) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 7),I= 19, 24) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 7),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 7),I= 31, 36) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 7),I= 37, 42) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 7),I= 43, 48) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 7),I= 49, 54) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ 
-2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 7),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 7),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 7),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 7),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 7),I= 79, 84) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 7),I= 85, 90) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 7),I= 91, 96) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 7),I= 97,102) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 7),I=103,108) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 7),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 7),I=115,120) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ + DATA (CF(I),I=706,819) /4096,-1024,-1024,128,128,1280,1280,-160 + $ ,128,-16,992,1136,-160,992,-16,-160,-448,992,128,-16,-16,2,2,20 + $ ,1280,-160,-160,20,20,200,1136,-142,992,-124,1010,1028,-142,38, + $ -124,-106,-268,-88,-1024,128,128,-16,-16,-160,128,-16,-16,2,2 + $ ,20,-16,2,-160,20,-142,-124,2,20,20,-124,-124,56,-160,20,-16,2, + $ -124,-142,992,-124,-448,56,-268,-88,-160,20,992,-124,38,-106, + $ -268,1010,884,-232,-268,884,20,-124,2,20,56,-124,-124,-106,56, + $ -88,884,-232,20,200,-124,1028,-106,-88,-88,1028,-232,272,-88, + $ -232/ C 1 T(1,5,2,6,7,3,4) - DATA (CF(I, 8),I= 1, 6) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 8),I= 7, 12) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 8),I= 13, 18) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 8),I= 19, 24) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 8),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 8),I= 31, 36) 
/-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 8),I= 37, 42) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 8),I= 43, 48) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 8),I= 49, 54) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 8),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 8),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 8),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 8),I= 73, 78) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 8),I= 79, 84) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 8),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 8),I= 91, 96) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 8),I= 97,102) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 8),I=103,108) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 8),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 8),I=115,120) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ + DATA (CF(I),I=820,932) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,-16,128,2,20,-16,2, + $ -160,1280,20,200,-160,20,-142,38,-124,-106,-268,-88,1136,-142 + $ ,992,-124,1010,1028,128,-1024,-16,-160,128,-16,-16,128,2,20,-16 + $ ,2,2,20,20,-124,-124,56,-16,2,-160,20,-142,-124,20,-124,2,20,56 + $ ,-124,-124,-106,56,-88,884,-232,20,200,-124,1028,-106,-88,-88 + $ ,1028,-232,272,-88,-232,-160,20,-16,2,-124,-142,992,-124,-448 + $ ,56,-268,-88,-160,20,992,-124,38,-106,-268,1010,884,-232,-268 + $ ,884/ C 1 T(1,5,2,7,6,3,4) - DATA (CF(I, 9),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 9),I= 7, 12) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ 
-1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 9),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 9),I= 19, 24) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 9),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 9),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 9),I= 37, 42) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 9),I= 43, 48) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 9),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 9),I= 55, 60) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 9),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 9),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 9),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 9),I= 79, 84) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 9),I= 85, 90) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 9),I= 91, 96) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 9),I= 97,102) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 9),I=103,108) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 9),I=109,114) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 9),I=115,120) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ + DATA (CF(I),I=933,1044) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,-16,2,-160,20,-142,-124, + $ -160,20,992,-124,38,-106,992,-124,-448,56,-268,-88,1010,-268, + $ 
-268,884,884,-232,128,-16,-1024,128,-160,-16,-16,2,-160,20,-142 + $ ,-124,128,-16,-16,2,2,20,20,2,-124,56,20,-124,-16,2,128,-16,20 + $ ,2,1136,-142,992,-124,1010,1028,1280,-160,-160,20,20,200,38, + $ -142,-268,-88,-124,-106,-124,56,20,2,-124,20,1028,-88,-88,-232, + $ -232,272,200,20,-106,-88,-124,1028,-106,-124,884,-232,56,-88/ C 1 T(1,5,6,2,7,3,4) - DATA (CF(I, 10),I= 1, 6) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 10),I= 7, 12) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 10),I= 13, 18) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 10),I= 19, 24) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 10),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 10),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 10),I= 37, 42) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 10),I= 43, 48) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 10),I= 49, 54) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 10),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 10),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 10),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 10),I= 73, 78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 10),I= 79, 84) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 10),I= 85, 90) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 10),I= 91, 96) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 10),I= 97,102) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 10),I=103,108) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ 
,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 10),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 10),I=115,120) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=1045,1155) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,2,20,20,-124,-124,56,20,200, + $ -124,1028,-106,-88,-124,-106,56,-88,884,-232,1028,-88,-88,-232, + $ -232,272,-16,-160,128,-1024,-16,128,2,20,20,-124,-124,56,-16 + $ ,128,2,20,-16,2,2,-16,-142,-124,-160,20,2,20,-16,128,2,-16,-142 + $ ,38,-124,-106,-268,-88,-160,1280,20,200,-160,20,-142,1136,1010 + $ ,1028,992,-124,-142,-124,2,-16,20,-160,1010,-268,-268,884,884, + $ -232,20,-160,38,-106,992,-124,-124,992,-268,-88,-448,56/ C 1 T(1,5,6,7,2,3,4) - DATA (CF(I, 11),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 11),I= 7, 12) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 11),I= 13, 18) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 11),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 11),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 11),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 11),I= 37, 42) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 11),I= 43, 48) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 11),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 11),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 11),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 11),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 11),I= 73, 78) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 11),I= 79, 84) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 11),I= 85, 90) /3.086419753086420D-01 
- $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 11),I= 91, 96) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 11),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 11),I=103,108) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 11),I=109,114) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 11),I=115,120) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ + DATA (CF(I),I=1156,1265) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,2,-16,-142,-124,-160,20,20,-160,38, + $ -106,992,-124,1010,-268,-268,884,884,-232,992,-124,-448,56,-268 + $ ,-88,-16,128,-160,-16,-1024,128,2,-16,-142,-124,-160,20,20,2, + $ -124,56,20,-124,128,-16,-16,2,2,20,-124,56,20,2,-124,20,1028, + $ -88,-88,-232,-232,272,200,20,-106,-88,-124,1028,-106,-124,884, + $ -232,56,-88,-16,2,128,-16,20,2,1136,-142,992,-124,1010,1028 + $ ,1280,-160,-160,20,20,200,38,-142,-268,-88,-124,-106/ C 1 T(1,5,7,2,6,3,4) - DATA (CF(I, 12),I= 1, 6) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 12),I= 7, 12) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 12),I= 13, 18) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 12),I= 19, 24) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 12),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 12),I= 31, 36) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 12),I= 37, 42) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 12),I= 43, 48) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 12),I= 49, 54) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 12),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 12),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ 
,3.086419753086420D-02/ - DATA (CF(I, 12),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 12),I= 73, 78) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 12),I= 79, 84) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 12),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 12),I= 91, 96) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 12),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 12),I=103,108) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 12),I=109,114) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 12),I=115,120) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ + DATA (CF(I),I=1266,1374) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,20,2,-124,56,20,-124,200,20,-106,-88, + $ -124,1028,1028,-88,-88,-232,-232,272,-124,-106,56,-88,884,-232, + $ -160,-16,-16,128,128,-1024,20,2,-124,56,20,-124,2,-16,-142,-124 + $ ,-160,20,-16,128,2,20,-16,2,-142,-124,2,-16,20,-160,1010,-268, + $ -268,884,884,-232,20,-160,38,-106,992,-124,-124,992,-268,-88, + $ -448,56,2,20,-16,128,2,-16,-142,38,-124,-106,-268,-88,-160,1280 + $ ,20,200,-160,20,-142,1136,1010,1028,992,-124/ C 1 T(1,5,7,6,2,3,4) - DATA (CF(I, 13),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 13),I= 7, 12) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 13),I= 13, 18) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 13),I= 19, 24) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 13),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 13),I= 31, 36) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 13),I= 37, 42) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 13),I= 43, 48) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ 
-1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 13),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 13),I= 55, 60) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 13),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 13),I= 67, 72) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 13),I= 73, 78) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 13),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 13),I= 85, 90) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 13),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 13),I= 97,102) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 13),I=103,108) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 13),I=109,114) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 13),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=1375,1482) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,-16,2,128,-16,20,2,1136,-142,992,-124,1010 + $ ,1028,1280,-160,-160,20,20,200,38,-142,-268,-88,-124,-106,-160 + $ ,20,-16,2,-124,-142,992,-124,-448,56,-268,-88,-160,20,992,-124 + $ ,38,-106,-268,1010,884,-232,-268,884,-1024,128,128,-16,-16,-160 + $ ,128,-16,-16,2,2,20,-16,2,-160,20,-142,-124,2,20,20,-124,-124 + $ ,56,-124,20,56,-124,2,20,-106,-124,884,-232,56,-88,-88,1028, + $ -232,272,-88,-232,20,200,-124,1028,-106,-88/ C 1 T(1,6,2,5,7,3,4) - DATA (CF(I, 14),I= 1, 6) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 14),I= 7, 12) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 14),I= 13, 18) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 14),I= 19, 24) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 14),I= 25, 30) /3.086419753086420D-03 - $ 
,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 14),I= 31, 36) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 14),I= 37, 42) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 14),I= 43, 48) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 14),I= 49, 54) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 14),I= 55, 60) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 14),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 14),I= 67, 72) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 14),I= 73, 78) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 14),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 14),I= 85, 90) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 14),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 14),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 14),I=103,108) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 14),I=109,114) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 14),I=115,120) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ + DATA (CF(I),I=1483,1589) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16,2,20,-16,128,2,-16,-142,38,-124,-106,-268,-88, + $ -160,1280,20,200,-160,20,-142,1136,1010,1028,992,-124,20,-124,2 + $ ,20,56,-124,-124,-106,56,-88,884,-232,20,200,-124,1028,-106,-88 + $ ,-88,1028,-232,272,-88,-232,128,-1024,-16,-160,128,-16,-16,128 + $ ,2,20,-16,2,2,20,20,-124,-124,56,-16,2,-160,20,-142,-124,20, + $ -160,-124,-142,-16,2,-124,992,-268,-88,-448,56,-268,1010,884, + $ -232,-268,884,-160,20,992,-124,38,-106/ C 1 T(1,6,2,7,5,3,4) - DATA (CF(I, 15),I= 1, 6) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 15),I= 7, 
12) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 15),I= 13, 18) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 15),I= 19, 24) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 15),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 15),I= 31, 36) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 15),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 15),I= 43, 48) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 15),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 15),I= 55, 60) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 15),I= 61, 66) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 15),I= 67, 72) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 15),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 15),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 15),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 15),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 15),I= 97,102) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 15),I=103,108) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 15),I=109,114) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 15),I=115,120) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ + DATA (CF(I),I=1590,1695) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16,-160,20,-16,2,-124,-142,992,-124,-448,56,-268,-88,-160 + 
$ ,20,992,-124,38,-106,-268,1010,884,-232,-268,884,-16,2,128,-16 + $ ,20,2,1136,-142,992,-124,1010,1028,1280,-160,-160,20,20,200,38, + $ -142,-268,-88,-124,-106,128,-16,-1024,128,-160,-16,-16,2,-160 + $ ,20,-142,-124,128,-16,-16,2,2,20,20,2,-124,56,20,-124,56,-124, + $ -124,20,20,2,-88,1028,-232,272,-88,-232,-106,-124,884,-232,56, + $ -88,200,20,-106,-88,-124,1028/ C 1 T(1,6,5,2,7,3,4) - DATA (CF(I, 16),I= 1, 6) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 16),I= 7, 12) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 16),I= 13, 18) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 16),I= 19, 24) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 16),I= 25, 30) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 16),I= 31, 36) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 16),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 16),I= 43, 48) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 16),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 16),I= 55, 60) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 16),I= 61, 66) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 16),I= 67, 72) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 16),I= 73, 78) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 16),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 16),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 16),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 16),I= 97,102) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 16),I=103,108) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ 
-3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 16),I=109,114) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 16),I=115,120) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ + DATA (CF(I),I=1696,1800) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,20,-124,2,20,56,-124,-124,-106,56,-88,884,-232,20,200,-124 + $ ,1028,-106,-88,-88,1028,-232,272,-88,-232,2,20,-16,128,2,-16, + $ -142,38,-124,-106,-268,-88,-160,1280,20,200,-160,20,-142,1136 + $ ,1010,1028,992,-124,-16,-160,128,-1024,-16,128,2,20,20,-124, + $ -124,56,-16,128,2,20,-16,2,2,-16,-142,-124,-160,20,-124,-142,20 + $ ,-160,2,-16,-268,1010,884,-232,-268,884,-124,992,-268,-88,-448 + $ ,56,20,-160,38,-106,992,-124/ C 1 T(1,6,5,7,2,3,4) - DATA (CF(I, 17),I= 1, 6) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 17),I= 7, 12) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 17),I= 13, 18) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 17),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 17),I= 25, 30) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 17),I= 31, 36) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 17),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 17),I= 43, 48) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 17),I= 49, 54) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 17),I= 55, 60) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 17),I= 61, 66) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 17),I= 67, 72) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 17),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 17),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 17),I= 85, 90) /3.086419753086420D-02 - $ 
,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 17),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 17),I= 97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 17),I=103,108) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 17),I=109,114) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 17),I=115,120) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ + DATA (CF(I),I=1801,1904) /4096,-1024,-16,128,-160,-16,-1024,128, + $ -142,-124,2,-16,20,-160,1010,-268,-268,884,884,-232,20,-160,38, + $ -106,992,-124,-124,992,-268,-88,-448,56,-124,56,20,2,-124,20 + $ ,1028,-88,-88,-232,-232,272,200,20,-106,-88,-124,1028,-106,-124 + $ ,884,-232,56,-88,-16,128,-160,-16,-1024,128,2,-16,-142,-124, + $ -160,20,20,2,-124,56,20,-124,128,-16,-16,2,2,20,2,-16,20,2,128, + $ -16,-142,1136,1010,1028,992,-124,38,-142,-268,-88,-124,-106 + $ ,1280,-160,-160,20,20,200/ C 1 T(1,6,7,2,5,3,4) - DATA (CF(I, 18),I= 1, 6) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 18),I= 7, 12) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 18),I= 13, 18) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 18),I= 19, 24) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 18),I= 25, 30) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 18),I= 31, 36) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 18),I= 37, 42) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 18),I= 43, 48) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 18),I= 49, 54) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 18),I= 55, 60) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 18),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 18),I= 67, 72) 
/-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 18),I= 73, 78) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 18),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 18),I= 85, 90) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 18),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 18),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 18),I=103,108) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 18),I=109,114) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 18),I=115,120) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ + DATA (CF(I),I=1905,2007) /4096,-160,-16,-16,128,128,-1024,-124 + $ ,56,20,2,-124,20,1028,-88,-88,-232,-232,272,200,20,-106,-88, + $ -124,1028,-106,-124,884,-232,56,-88,-142,-124,2,-16,20,-160 + $ ,1010,-268,-268,884,884,-232,20,-160,38,-106,992,-124,-124,992, + $ -268,-88,-448,56,-160,-16,-16,128,128,-1024,20,2,-124,56,20, + $ -124,2,-16,-142,-124,-160,20,-16,128,2,20,-16,2,20,2,2,-16,-16 + $ ,128,38,-142,-268,-88,-124,-106,-142,1136,1010,1028,992,-124, + $ -160,1280,20,200,-160,20/ C 1 T(1,6,7,5,2,3,4) - DATA (CF(I, 19),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 19),I= 7, 12) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 19),I= 13, 18) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 19),I= 19, 24) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 19),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 19),I= 31, 36) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 19),I= 37, 42) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 19),I= 43, 48) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 19),I= 49, 54) 
/3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 19),I= 55, 60) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 19),I= 61, 66) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 19),I= 67, 72) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 19),I= 73, 78) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 19),I= 79, 84) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 19),I= 85, 90) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 19),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 19),I= 97,102) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 19),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 19),I=109,114) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 19),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=2008,2109) /4096,-1024,-1024,128,128,1280,2,-16,20 + $ ,2,128,-16,-142,1136,1010,1028,992,-124,38,-142,-268,-88,-124, + $ -106,1280,-160,-160,20,20,200,20,-160,-124,-142,-16,2,-124,992, + $ -268,-88,-448,56,-268,1010,884,-232,-268,884,-160,20,992,-124 + $ ,38,-106,-124,20,56,-124,2,20,-106,-124,884,-232,56,-88,-88 + $ ,1028,-232,272,-88,-232,20,200,-124,1028,-106,-88,-1024,128,128 + $ ,-16,-16,-160,128,-16,-16,2,2,20,-16,2,-160,20,-142,-124,2,20 + $ ,20,-124,-124,56/ C 1 T(1,7,2,5,6,3,4) - DATA (CF(I, 20),I= 1, 6) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 20),I= 7, 12) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 20),I= 13, 18) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 20),I= 19, 24) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 20),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA 
(CF(I, 20),I= 31, 36) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 20),I= 37, 42) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 20),I= 43, 48) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 20),I= 49, 54) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 20),I= 55, 60) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 20),I= 61, 66) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 20),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 20),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 20),I= 79, 84) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 20),I= 85, 90) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 20),I= 91, 96) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 20),I= 97,102) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 20),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 20),I=109,114) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 20),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=2110,2210) /4096,128,1280,-1024,128,20,2,2,-16,-16 + $ ,128,38,-142,-268,-88,-124,-106,-142,1136,1010,1028,992,-124, + $ -160,1280,20,200,-160,20,-124,20,56,-124,2,20,-106,-124,884, + $ -232,56,-88,-88,1028,-232,272,-88,-232,20,200,-124,1028,-106, + $ -88,20,-160,-124,-142,-16,2,-124,992,-268,-88,-448,56,-268,1010 + $ ,884,-232,-268,884,-160,20,992,-124,38,-106,128,-1024,-16,-160 + $ ,128,-16,-16,128,2,20,-16,2,2,20,20,-124,-124,56,-16,2,-160,20, + $ -142,-124/ C 1 T(1,7,2,6,5,3,4) - DATA (CF(I, 21),I= 1, 6) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 21),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - 
DATA (CF(I, 21),I= 13, 18) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 21),I= 19, 24) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 21),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 21),I= 31, 36) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 21),I= 37, 42) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 21),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 21),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 21),I= 55, 60) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 21),I= 61, 66) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 21),I= 67, 72) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 21),I= 73, 78) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 21),I= 79, 84) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 21),I= 85, 90) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 21),I= 91, 96) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 21),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 21),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 21),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 21),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ + DATA (CF(I),I=2211,2310) /4096,-1024,1280,128,20,-160,-124,-142, + $ -16,2,-124,992,-268,-88,-448,56,-268,1010,884,-232,-268,884, + $ -160,20,992,-124,38,-106,2,-16,20,2,128,-16,-142,1136,1010,1028 + $ ,992,-124,38,-142,-268,-88,-124,-106,1280,-160,-160,20,20,200 + $ 
,56,-124,-124,20,20,2,-88,1028,-232,272,-88,-232,-106,-124,884, + $ -232,56,-88,200,20,-106,-88,-124,1028,128,-16,-1024,128,-160, + $ -16,-16,2,-160,20,-142,-124,128,-16,-16,2,2,20,20,2,-124,56,20, + $ -124/ C 1 T(1,7,5,2,6,3,4) - DATA (CF(I, 22),I= 1, 6) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 22),I= 7, 12) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 22),I= 13, 18) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 22),I= 19, 24) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 22),I= 25, 30) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 22),I= 31, 36) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 22),I= 37, 42) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 22),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 22),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 22),I= 55, 60) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 22),I= 61, 66) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 22),I= 67, 72) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 22),I= 73, 78) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 22),I= 79, 84) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 22),I= 85, 90) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 22),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 22),I= 97,102) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 22),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 22),I=109,114) /-2.469135802469136D-02 - $ 
,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 22),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ + DATA (CF(I),I=2311,2409) /4096,128,-1024,-124,20,56,-124,2,20, + $ -106,-124,884,-232,56,-88,-88,1028,-232,272,-88,-232,20,200, + $ -124,1028,-106,-88,20,2,2,-16,-16,128,38,-142,-268,-88,-124, + $ -106,-142,1136,1010,1028,992,-124,-160,1280,20,200,-160,20,-124 + $ ,-142,20,-160,2,-16,-268,1010,884,-232,-268,884,-124,992,-268, + $ -88,-448,56,20,-160,38,-106,992,-124,-16,-160,128,-1024,-16,128 + $ ,2,20,20,-124,-124,56,-16,128,2,20,-16,2,2,-16,-142,-124,-160 + $ ,20/ C 1 T(1,7,5,6,2,3,4) - DATA (CF(I, 23),I= 1, 6) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 23),I= 7, 12) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 23),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 23),I= 19, 24) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 23),I= 25, 30) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 23),I= 31, 36) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 23),I= 37, 42) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 23),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 23),I= 49, 54) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 23),I= 55, 60) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 23),I= 61, 66) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 23),I= 67, 72) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 23),I= 73, 78) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 23),I= 79, 84) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 23),I= 85, 90) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 23),I= 91, 96) /1.975308641975309D+00, - $ 
-2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 23),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 23),I=103,108) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 23),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 23),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ + DATA (CF(I),I=2410,2507) /4096,-1024,-124,-142,20,-160,2,-16, + $ -268,1010,884,-232,-268,884,-124,992,-268,-88,-448,56,20,-160 + $ ,38,-106,992,-124,56,-124,-124,20,20,2,-88,1028,-232,272,-88, + $ -232,-106,-124,884,-232,56,-88,200,20,-106,-88,-124,1028,2,-16 + $ ,20,2,128,-16,-142,1136,1010,1028,992,-124,38,-142,-268,-88, + $ -124,-106,1280,-160,-160,20,20,200,-16,128,-160,-16,-1024,128,2 + $ ,-16,-142,-124,-160,20,20,2,-124,56,20,-124,128,-16,-16,2,2,20/ C 1 T(1,7,6,2,5,3,4) - DATA (CF(I, 24),I= 1, 6) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 24),I= 7, 12) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 24),I= 13, 18) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 24),I= 19, 24) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 24),I= 25, 30) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 24),I= 31, 36) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 24),I= 37, 42) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 24),I= 43, 48) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 24),I= 49, 54) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 24),I= 55, 60) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 24),I= 61, 66) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 24),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 24),I= 73, 78) 
/3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 24),I= 79, 84) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 24),I= 85, 90) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 24),I= 91, 96) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 24),I= 97,102) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 24),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 24),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 24),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ + DATA (CF(I),I=2508,2604) /4096,56,-124,-124,20,20,2,-88,1028, + $ -232,272,-88,-232,-106,-124,884,-232,56,-88,200,20,-106,-88, + $ -124,1028,-124,-142,20,-160,2,-16,-268,1010,884,-232,-268,884, + $ -124,992,-268,-88,-448,56,20,-160,38,-106,992,-124,20,2,2,-16, + $ -16,128,38,-142,-268,-88,-124,-106,-142,1136,1010,1028,992,-124 + $ ,-160,1280,20,200,-160,20,-160,-16,-16,128,128,-1024,20,2,-124 + $ ,56,20,-124,2,-16,-142,-124,-160,20,-16,128,2,20,-16,2/ C 1 T(1,7,6,5,2,3,4) - DATA (CF(I, 25),I= 1, 6) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 25),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 25),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 25),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 25),I= 25, 30) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 25),I= 31, 36) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 25),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 25),I= 43, 48) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 25),I= 49, 54) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 25),I= 55, 60) 
/1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 25),I= 61, 66) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 25),I= 67, 72) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 25),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 25),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 25),I= 85, 90) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 25),I= 91, 96) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 25),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 25),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 25),I=109,114) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 25),I=115,120) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ + DATA (CF(I),I=2605,2700) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448,1280,-160,-160,20,20,200,128,-16,-16,2,2,20,992, + $ -124,1136,-142,1028,1010,-124,-106,-142,38,-88,-268,-160,20,992 + $ ,-124,38,-106,-16,2,-160,20,-142,-124,-448,56,992,-124,-88,-268 + $ ,-268,884,1010,-268,-232,884,20,200,-124,1028,-106,-88,2,20,20, + $ -124,-124,56,56,-88,-124,-106,-232,884,-88,-232,1028,-88,272, + $ -232/ C 1 T(2,1,5,6,7,3,4) - DATA (CF(I, 26),I= 1, 6) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 26),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 26),I= 13, 18) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 26),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 26),I= 25, 30) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 26),I= 31, 36) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 
26),I= 37, 42) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 26),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 26),I= 49, 54) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 26),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 26),I= 61, 66) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 26),I= 67, 72) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 26),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 26),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 26),I= 85, 90) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 26),I= 91, 96) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 26),I= 97,102) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 26),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 26),I=109,114) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 26),I=115,120) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ + DATA (CF(I),I=2701,2795) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,-160,1280,20,200,-160,20,-16,128,2,20,-16,2,-124,-106,-142 + $ ,38,-88,-268,992,-124,1136,-142,1028,1010,20,200,-124,1028,-106 + $ ,-88,2,20,20,-124,-124,56,56,-88,-124,-106,-232,884,-88,-232 + $ ,1028,-88,272,-232,-160,20,992,-124,38,-106,-16,2,-160,20,-142, + $ -124,-448,56,992,-124,-88,-268,-268,884,1010,-268,-232,884/ C 1 T(2,1,5,7,6,3,4) - DATA (CF(I, 27),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 27),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 
27),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I= 25, 30) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 27),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 27),I= 37, 42) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 27),I= 43, 48) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 27),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 27),I= 55, 60) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I= 61, 66) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 27),I= 67, 72) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 27),I= 73, 78) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 27),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 27),I= 85, 90) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 27),I= 91, 96) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 27),I= 97,102) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 27),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I=109,114) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 27),I=115,120) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ + DATA (CF(I),I=2796,2889) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,-160,20,992,-124,38,-106,-16,2,-160,20,-142,-124,-448,56,992, + $ -124,-88,-268,-268,884,1010,-268,-232,884,1280,-160,-160,20,20 + $ ,200,128,-16,-16,2,2,20,992,-124,1136,-142,1028,1010,-124,-106, + $ -142,38,-88,-268,200,20,-106,-88,-124,1028,20,2,-124,56,20,-124 + $ ,-88,-232,1028,-88,272,-232,56,-88,-124,-106,-232,884/ C 1 T(2,1,6,5,7,3,4) - DATA 
(CF(I, 28),I= 1, 6) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 28),I= 7, 12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 28),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 28),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 28),I= 25, 30) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 28),I= 31, 36) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 28),I= 37, 42) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 28),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 28),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 28),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 28),I= 61, 66) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 28),I= 67, 72) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 28),I= 73, 78) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 28),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 28),I= 85, 90) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 28),I= 91, 96) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 28),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 28),I=103,108) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 28),I=109,114) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 28),I=115,120) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ 
-1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ + DATA (CF(I),I=2890,2982) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,20 + $ ,200,-124,1028,-106,-88,2,20,20,-124,-124,56,56,-88,-124,-106, + $ -232,884,-88,-232,1028,-88,272,-232,-160,1280,20,200,-160,20, + $ -16,128,2,20,-16,2,-124,-106,-142,38,-88,-268,992,-124,1136, + $ -142,1028,1010,20,-160,38,-106,992,-124,2,-16,-142,-124,-160,20 + $ ,-268,884,1010,-268,-232,884,-448,56,992,-124,-88,-268/ C 1 T(2,1,6,7,5,3,4) - DATA (CF(I, 29),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 29),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 29),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 29),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 29),I= 25, 30) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 29),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 29),I= 37, 42) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 29),I= 43, 48) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 29),I= 49, 54) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 29),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 29),I= 61, 66) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 29),I= 67, 72) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 29),I= 73, 78) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 29),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 29),I= 85, 90) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 29),I= 91, 96) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 29),I= 97,102) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ 
,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 29),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 29),I=109,114) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 29),I=115,120) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ + DATA (CF(I),I=2983,3074) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,20,-160 + $ ,38,-106,992,-124,2,-16,-142,-124,-160,20,-268,884,1010,-268, + $ -232,884,-448,56,992,-124,-88,-268,200,20,-106,-88,-124,1028,20 + $ ,2,-124,56,20,-124,-88,-232,1028,-88,272,-232,56,-88,-124,-106, + $ -232,884,1280,-160,-160,20,20,200,128,-16,-16,2,2,20,992,-124 + $ ,1136,-142,1028,1010,-124,-106,-142,38,-88,-268/ C 1 T(2,1,7,5,6,3,4) - DATA (CF(I, 30),I= 1, 6) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 30),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 30),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 30),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 30),I= 25, 30) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 30),I= 31, 36) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 30),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 30),I= 43, 48) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 30),I= 49, 54) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 30),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 30),I= 61, 66) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 30),I= 67, 72) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 30),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 30),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ 
-1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 30),I= 85, 90) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 30),I= 91, 96) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 30),I= 97,102) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 30),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 30),I=109,114) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 30),I=115,120) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ + DATA (CF(I),I=3075,3165) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16,200,20,-106, + $ -88,-124,1028,20,2,-124,56,20,-124,-88,-232,1028,-88,272,-232 + $ ,56,-88,-124,-106,-232,884,20,-160,38,-106,992,-124,2,-16,-142, + $ -124,-160,20,-268,884,1010,-268,-232,884,-448,56,992,-124,-88, + $ -268,-160,1280,20,200,-160,20,-16,128,2,20,-16,2,-124,-106,-142 + $ ,38,-88,-268,992,-124,1136,-142,1028,1010/ C 1 T(2,1,7,6,5,3,4) - DATA (CF(I, 31),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 31),I= 7, 12) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 31),I= 13, 18) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 31),I= 19, 24) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 31),I= 25, 30) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 31),I= 31, 36) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 31),I= 37, 42) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 31),I= 43, 48) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 31),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 31),I= 55, 60) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 31),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ 
,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 31),I= 67, 72) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 31),I= 73, 78) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 31),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 31),I= 85, 90) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 31),I= 91, 96) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 31),I= 97,102) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 31),I=103,108) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 31),I=109,114) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 31),I=115,120) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=3166,3255) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992,128,-16,-16,2 + $ ,2,20,-1024,128,128,-16,-16,-160,-160,20,-16,2,-124,-142,20, + $ -124,2,20,56,-124,992,-124,-448,56,-268,-88,-160,20,-16,2,-124, + $ -142,992,-124,-160,20,-106,38,884,-232,-268,1010,884,-268,-124, + $ -106,56,-88,884,-232,20,-124,2,20,56,-124,-124,1028,20,200,-88, + $ -106,-232,272,-88,1028,-232,-88/ C 1 T(2,5,1,6,7,3,4) - DATA (CF(I, 32),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 32),I= 7, 12) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 32),I= 13, 18) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 32),I= 19, 24) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 32),I= 25, 30) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 32),I= 31, 36) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 32),I= 37, 42) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 32),I= 43, 48) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ 
-2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 32),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 32),I= 55, 60) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 32),I= 61, 66) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 32),I= 67, 72) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 32),I= 73, 78) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 32),I= 79, 84) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 32),I= 85, 90) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 32),I= 91, 96) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 32),I= 97,102) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 32),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 32),I=109,114) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 32),I=115,120) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ + DATA (CF(I),I=3256,3344) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,-16,128,2,20,-16,2,128 + $ ,-1024,-16,-160,128,-16,20,-124,2,20,56,-124,-160,20,-16,2,-124 + $ ,-142,-124,-106,56,-88,884,-232,20,-124,2,20,56,-124,-124,1028 + $ ,20,200,-88,-106,-232,272,-88,1028,-232,-88,992,-124,-448,56, + $ -268,-88,-160,20,-16,2,-124,-142,992,-124,-160,20,-106,38,884, + $ -232,-268,1010,884,-268/ C 1 T(2,5,1,7,6,3,4) - DATA (CF(I, 33),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 33),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 33),I= 13, 18) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 33),I= 19, 24) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 33),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ 
-2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 33),I= 31, 36) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 33),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 33),I= 43, 48) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 33),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 33),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 33),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 33),I= 67, 72) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 33),I= 73, 78) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 33),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 33),I= 85, 90) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 33),I= 91, 96) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 33),I= 97,102) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 33),I=103,108) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 33),I=109,114) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 33),I=115,120) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=3345,3432) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,-16,2,-160,20,-142,-124,128 + $ ,-16,-1024,128,-160,-16,-16,2,128,-16,20,2,-124,56,20,2,-124,20 + $ ,1136,-142,992,-124,1010,1028,-16,2,128,-16,20,2,-160,20,1280, + $ -160,200,20,-268,-88,38,-142,-106,-124,1028,-88,-88,-232,-232 + $ ,272,-124,56,20,2,-124,20,-106,-88,200,20,1028,-124,884,-232, + $ -106,-124,-88,56/ C 1 T(2,5,6,1,7,3,4) - DATA (CF(I, 34),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 34),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ 
-1.358024691358025D-01/ - DATA (CF(I, 34),I= 13, 18) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 34),I= 19, 24) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 34),I= 25, 30) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 34),I= 31, 36) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 34),I= 37, 42) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 34),I= 43, 48) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 34),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 34),I= 55, 60) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 34),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 34),I= 67, 72) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 34),I= 73, 78) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 34),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 34),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 34),I= 91, 96) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 34),I= 97,102) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 34),I=103,108) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 34),I=109,114) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 34),I=115,120) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ + DATA (CF(I),I=3433,3519) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,2,20,20,-124,-124,56,-16,-160 + $ ,128,-1024,-16,128,2,20,-16,128,2,-16,-142,-124,2,-16,20,-160, + $ -142,38,-124,-106,-268,-88,2,20,-16,128,2,-16,20,200,-160,1280 + $ 
,20,-160,1010,1028,-142,1136,-124,992,1010,-268,-268,884,884, + $ -232,-142,-124,2,-16,20,-160,38,-106,20,-160,-124,992,-268,-88, + $ -124,992,56,-448/ C 1 T(2,5,6,7,1,3,4) - DATA (CF(I, 35),I= 1, 6) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 35),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 35),I= 13, 18) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 35),I= 19, 24) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 35),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 35),I= 31, 36) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 35),I= 37, 42) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 35),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 35),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 35),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 35),I= 61, 66) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 35),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 35),I= 73, 78) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 35),I= 79, 84) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 35),I= 85, 90) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 35),I= 91, 96) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 35),I= 97,102) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 35),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 35),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ 
-2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 35),I=115,120) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=3520,3605) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,2,-16,-142,-124,-160,20,-16,128, + $ -160,-16,-1024,128,-124,56,20,2,-124,20,-16,2,128,-16,20,2,1028 + $ ,-88,-88,-232,-232,272,-124,56,20,2,-124,20,-106,-88,200,20 + $ ,1028,-124,884,-232,-106,-124,-88,56,1136,-142,992,-124,1010 + $ ,1028,-16,2,128,-16,20,2,-160,20,1280,-160,200,20,-268,-88,38, + $ -142,-106,-124/ C 1 T(2,5,7,1,6,3,4) - DATA (CF(I, 36),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 36),I= 7, 12) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 36),I= 13, 18) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 36),I= 19, 24) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 36),I= 25, 30) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 36),I= 31, 36) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 36),I= 37, 42) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 36),I= 43, 48) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 36),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 36),I= 55, 60) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 36),I= 61, 66) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 36),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 36),I= 73, 78) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 36),I= 79, 84) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 36),I= 85, 90) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 36),I= 91, 96) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ 
,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 36),I= 97,102) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 36),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 36),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 36),I=115,120) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=3606,3690) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,20,2,-124,56,20,-124,-160,-16,-16,128 + $ ,128,-1024,-142,-124,2,-16,20,-160,2,20,-16,128,2,-16,1010,-268 + $ ,-268,884,884,-232,-142,-124,2,-16,20,-160,38,-106,20,-160,-124 + $ ,992,-268,-88,-124,992,56,-448,-142,38,-124,-106,-268,-88,2,20, + $ -16,128,2,-16,20,200,-160,1280,20,-160,1010,1028,-142,1136,-124 + $ ,992/ C 1 T(2,5,7,6,1,3,4) - DATA (CF(I, 37),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 37),I= 7, 12) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 37),I= 13, 18) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 37),I= 19, 24) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 37),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 37),I= 31, 36) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 37),I= 37, 42) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 37),I= 43, 48) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 37),I= 49, 54) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 37),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 37),I= 61, 66) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 37),I= 67, 72) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 37),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ 
,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 37),I= 79, 84) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 37),I= 85, 90) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 37),I= 91, 96) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 37),I= 97,102) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 37),I=103,108) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 37),I=109,114) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 37),I=115,120) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ + DATA (CF(I),I=3691,3774) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,992,-124,-448,56,-268,-88,-160,20,-16,2,-124 + $ ,-142,992,-124,-160,20,-106,38,884,-232,-268,1010,884,-268,128, + $ -16,-16,2,2,20,-1024,128,128,-16,-16,-160,-160,20,-16,2,-124, + $ -142,20,-124,2,20,56,-124,-106,-124,884,-232,56,-88,-124,20,56, + $ -124,2,20,-232,272,-88,1028,-232,-88,-124,1028,20,200,-88,-106/ C 1 T(2,6,1,5,7,3,4) - DATA (CF(I, 38),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 38),I= 7, 12) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 38),I= 13, 18) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 38),I= 19, 24) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 38),I= 25, 30) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 38),I= 31, 36) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 38),I= 37, 42) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 38),I= 43, 48) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 38),I= 49, 54) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 38),I= 55, 60) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 
38),I= 61, 66) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 38),I= 67, 72) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 38),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 38),I= 79, 84) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 38),I= 85, 90) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 38),I= 91, 96) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 38),I= 97,102) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 38),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 38),I=109,114) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 38),I=115,120) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ + DATA (CF(I),I=3775,3857) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16,-124,-106,56,-88,884,-232,20,-124,2,20,56,-124, + $ -124,1028,20,200,-88,-106,-232,272,-88,1028,-232,-88,-16,128,2 + $ ,20,-16,2,128,-1024,-16,-160,128,-16,20,-124,2,20,56,-124,-160 + $ ,20,-16,2,-124,-142,-124,992,-268,-88,-448,56,20,-160,-124,-142 + $ ,-16,2,884,-232,-268,1010,884,-268,992,-124,-160,20,-106,38/ C 1 T(2,6,1,7,5,3,4) - DATA (CF(I, 39),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 39),I= 7, 12) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 39),I= 13, 18) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 39),I= 19, 24) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 39),I= 25, 30) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 39),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 39),I= 37, 42) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 39),I= 43, 48) /-6.913580246913580D-01 - $ 
,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 39),I= 49, 54) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 39),I= 55, 60) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 39),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 39),I= 67, 72) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 39),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 39),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 39),I= 85, 90) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 39),I= 91, 96) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 39),I= 97,102) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 39),I=103,108) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 39),I=109,114) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 39),I=115,120) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ + DATA (CF(I),I=3858,3939) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16,1136,-142,992,-124,1010,1028,-16,2,128,-16,20,2,-160 + $ ,20,1280,-160,200,20,-268,-88,38,-142,-106,-124,-16,2,-160,20, + $ -142,-124,128,-16,-1024,128,-160,-16,-16,2,128,-16,20,2,-124,56 + $ ,20,2,-124,20,-88,1028,-232,272,-88,-232,56,-124,-124,20,20,2 + $ ,884,-232,-106,-124,-88,56,-106,-88,200,20,1028,-124/ C 1 T(2,6,5,1,7,3,4) - DATA (CF(I, 40),I= 1, 6) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 40),I= 7, 12) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 40),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 40),I= 19, 24) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 40),I= 25, 30) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ 
-2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 40),I= 31, 36) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 40),I= 37, 42) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 40),I= 43, 48) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 40),I= 49, 54) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 40),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 40),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 40),I= 67, 72) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 40),I= 73, 78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 40),I= 79, 84) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 40),I= 85, 90) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 40),I= 91, 96) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 40),I= 97,102) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 40),I=103,108) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 40),I=109,114) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 40),I=115,120) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=3940,4020) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,-142,38,-124,-106,-268,-88,2,20,-16,128,2,-16,20,200,-160 + $ ,1280,20,-160,1010,1028,-142,1136,-124,992,2,20,20,-124,-124,56 + $ ,-16,-160,128,-1024,-16,128,2,20,-16,128,2,-16,-142,-124,2,-16 + $ ,20,-160,-268,1010,884,-232,-268,884,-124,-142,20,-160,2,-16, + $ -268,-88,-124,992,56,-448,38,-106,20,-160,-124,992/ C 1 T(2,6,5,7,1,3,4) - DATA (CF(I, 41),I= 1, 6) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 41),I= 7, 12) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ 
-3.580246913580247D-01/ - DATA (CF(I, 41),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 41),I= 19, 24) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 41),I= 25, 30) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 41),I= 31, 36) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 41),I= 37, 42) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 41),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 41),I= 49, 54) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 41),I= 55, 60) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 41),I= 61, 66) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 41),I= 67, 72) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 41),I= 73, 78) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 41),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 41),I= 85, 90) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 41),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 41),I= 97,102) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 41),I=103,108) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 41),I=109,114) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 41),I=115,120) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ + DATA (CF(I),I=4021,4100) /4096,-1024,-16,128,-160,-16,-1024,128 + $ ,1028,-88,-88,-232,-232,272,-124,56,20,2,-124,20,-106,-88,200 + $ ,20,1028,-124,884,-232,-106,-124,-88,56,2,-16,-142,-124,-160,20 + $ ,-16,128,-160,-16,-1024,128,-124,56,20,2,-124,20,-16,2,128,-16 + $ 
,20,2,-142,1136,1010,1028,992,-124,2,-16,20,2,128,-16,-268,-88 + $ ,38,-142,-106,-124,-160,20,1280,-160,200,20/ C 1 T(2,6,7,1,5,3,4) - DATA (CF(I, 42),I= 1, 6) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 42),I= 7, 12) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 42),I= 13, 18) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 42),I= 19, 24) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 42),I= 25, 30) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 42),I= 31, 36) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 42),I= 37, 42) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 42),I= 43, 48) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 42),I= 49, 54) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 42),I= 55, 60) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 42),I= 61, 66) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 42),I= 67, 72) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 42),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 42),I= 79, 84) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 42),I= 85, 90) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 42),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 42),I= 97,102) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 42),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 42),I=109,114) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ 
,1.530864197530864D+00/ - DATA (CF(I, 42),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=4101,4179) /4096,-160,-16,-16,128,128,-1024,1010, + $ -268,-268,884,884,-232,-142,-124,2,-16,20,-160,38,-106,20,-160, + $ -124,992,-268,-88,-124,992,56,-448,20,2,-124,56,20,-124,-160, + $ -16,-16,128,128,-1024,-142,-124,2,-16,20,-160,2,20,-16,128,2, + $ -16,38,-142,-268,-88,-124,-106,20,2,2,-16,-16,128,1010,1028, + $ -142,1136,-124,992,20,200,-160,1280,20,-160/ C 1 T(2,6,7,5,1,3,4) - DATA (CF(I, 43),I= 1, 6) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 43),I= 7, 12) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 43),I= 13, 18) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 43),I= 19, 24) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 43),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 43),I= 31, 36) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 43),I= 37, 42) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 43),I= 43, 48) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 43),I= 49, 54) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 43),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 43),I= 61, 66) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 43),I= 67, 72) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 43),I= 73, 78) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 43),I= 79, 84) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 43),I= 85, 90) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 43),I= 91, 96) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 43),I= 97,102) /1.975308641975309D-01, - $ 
-2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 43),I=103,108) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 43),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 43),I=115,120) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ + DATA (CF(I),I=4180,4257) /4096,-1024,-1024,128,128,1280,-124,992 + $ ,-268,-88,-448,56,20,-160,-124,-142,-16,2,884,-232,-268,1010 + $ ,884,-268,992,-124,-160,20,-106,38,-106,-124,884,-232,56,-88, + $ -124,20,56,-124,2,20,-232,272,-88,1028,-232,-88,-124,1028,20 + $ ,200,-88,-106,128,-16,-16,2,2,20,-1024,128,128,-16,-16,-160, + $ -160,20,-16,2,-124,-142,20,-124,2,20,56,-124/ C 1 T(2,7,1,5,6,3,4) - DATA (CF(I, 44),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 44),I= 7, 12) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 44),I= 13, 18) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 44),I= 19, 24) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 44),I= 25, 30) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 44),I= 31, 36) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 44),I= 37, 42) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 44),I= 43, 48) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 44),I= 49, 54) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 44),I= 55, 60) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 44),I= 61, 66) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 44),I= 67, 72) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 44),I= 73, 78) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 44),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ 
,3.086419753086420D-03/ - DATA (CF(I, 44),I= 85, 90) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 44),I= 91, 96) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 44),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 44),I=103,108) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 44),I=109,114) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 44),I=115,120) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ + DATA (CF(I),I=4258,4334) /4096,128,1280,-1024,128,-106,-124,884, + $ -232,56,-88,-124,20,56,-124,2,20,-232,272,-88,1028,-232,-88, + $ -124,1028,20,200,-88,-106,-124,992,-268,-88,-448,56,20,-160, + $ -124,-142,-16,2,884,-232,-268,1010,884,-268,992,-124,-160,20, + $ -106,38,-16,128,2,20,-16,2,128,-1024,-16,-160,128,-16,20,-124,2 + $ ,20,56,-124,-160,20,-16,2,-124,-142/ C 1 T(2,7,1,6,5,3,4) - DATA (CF(I, 45),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 45),I= 7, 12) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 45),I= 13, 18) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 45),I= 19, 24) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 45),I= 25, 30) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 45),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 45),I= 37, 42) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 45),I= 43, 48) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 45),I= 49, 54) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 45),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 45),I= 61, 66) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 45),I= 67, 72) /-2.469135802469136D-01 - $ 
,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 45),I= 73, 78) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 45),I= 79, 84) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 45),I= 85, 90) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 45),I= 91, 96) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 45),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 45),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 45),I=109,114) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 45),I=115,120) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ + DATA (CF(I),I=4335,4410) /4096,-1024,1280,128,-142,1136,1010 + $ ,1028,992,-124,2,-16,20,2,128,-16,-268,-88,38,-142,-106,-124, + $ -160,20,1280,-160,200,20,-88,1028,-232,272,-88,-232,56,-124, + $ -124,20,20,2,884,-232,-106,-124,-88,56,-106,-88,200,20,1028, + $ -124,-16,2,-160,20,-142,-124,128,-16,-1024,128,-160,-16,-16,2 + $ ,128,-16,20,2,-124,56,20,2,-124,20/ C 1 T(2,7,5,1,6,3,4) - DATA (CF(I, 46),I= 1, 6) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 46),I= 7, 12) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 46),I= 13, 18) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 46),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 46),I= 25, 30) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 46),I= 31, 36) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 46),I= 37, 42) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 46),I= 43, 48) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 46),I= 49, 54) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ 
-1.635802469135803D-01/ - DATA (CF(I, 46),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 46),I= 61, 66) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 46),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 46),I= 73, 78) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 46),I= 79, 84) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 46),I= 85, 90) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 46),I= 91, 96) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 46),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 46),I=103,108) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 46),I=109,114) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 46),I=115,120) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=4411,4485) /4096,128,-1024,38,-142,-268,-88,-124, + $ -106,20,2,2,-16,-16,128,1010,1028,-142,1136,-124,992,20,200, + $ -160,1280,20,-160,-268,1010,884,-232,-268,884,-124,-142,20,-160 + $ ,2,-16,-268,-88,-124,992,56,-448,38,-106,20,-160,-124,992,2,20 + $ ,20,-124,-124,56,-16,-160,128,-1024,-16,128,2,20,-16,128,2,-16, + $ -142,-124,2,-16,20,-160/ C 1 T(2,7,5,6,1,3,4) - DATA (CF(I, 47),I= 1, 6) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 47),I= 7, 12) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 47),I= 13, 18) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 47),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 47),I= 25, 30) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 47),I= 31, 36) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 47),I= 37, 42) /-2.469135802469136D-02 - $ 
,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 47),I= 43, 48) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 47),I= 49, 54) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 47),I= 55, 60) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 47),I= 61, 66) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 47),I= 67, 72) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 47),I= 73, 78) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 47),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 47),I= 85, 90) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 47),I= 91, 96) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 47),I= 97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 47),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 47),I=109,114) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 47),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ + DATA (CF(I),I=4486,4559) /4096,-1024,-88,1028,-232,272,-88,-232 + $ ,56,-124,-124,20,20,2,884,-232,-106,-124,-88,56,-106,-88,200,20 + $ ,1028,-124,-142,1136,1010,1028,992,-124,2,-16,20,2,128,-16,-268 + $ ,-88,38,-142,-106,-124,-160,20,1280,-160,200,20,2,-16,-142,-124 + $ ,-160,20,-16,128,-160,-16,-1024,128,-124,56,20,2,-124,20,-16,2 + $ ,128,-16,20,2/ C 1 T(2,7,6,1,5,3,4) - DATA (CF(I, 48),I= 1, 6) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 48),I= 7, 12) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 48),I= 13, 18) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 48),I= 19, 24) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - 
DATA (CF(I, 48),I= 25, 30) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 48),I= 31, 36) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 48),I= 37, 42) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 48),I= 43, 48) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 48),I= 49, 54) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 48),I= 55, 60) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 48),I= 61, 66) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 48),I= 67, 72) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 48),I= 73, 78) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 48),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 48),I= 85, 90) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 48),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 48),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 48),I=103,108) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 48),I=109,114) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 48),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=4560,4632) /4096,-268,1010,884,-232,-268,884,-124, + $ -142,20,-160,2,-16,-268,-88,-124,992,56,-448,38,-106,20,-160, + $ -124,992,38,-142,-268,-88,-124,-106,20,2,2,-16,-16,128,1010 + $ ,1028,-142,1136,-124,992,20,200,-160,1280,20,-160,20,2,-124,56 + $ ,20,-124,-160,-16,-16,128,128,-1024,-142,-124,2,-16,20,-160,2 + $ ,20,-16,128,2,-16/ C 1 T(2,7,6,5,1,3,4) - DATA (CF(I, 49),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 49),I= 7, 12) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ 
-2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 49),I= 13, 18) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 49),I= 19, 24) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 49),I= 25, 30) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 49),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 49),I= 37, 42) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 49),I= 43, 48) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 49),I= 49, 54) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 49),I= 55, 60) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 49),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 49),I= 67, 72) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 49),I= 73, 78) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 49),I= 79, 84) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 49),I= 85, 90) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 49),I= 91, 96) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 49),I= 97,102) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 49),I=103,108) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 49),I=109,114) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 49),I=115,120) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ + DATA (CF(I),I=4633,4704) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448,992,-124,-160,20,-106,38,-448,56,992,-124,-88, + $ 
-268,-16,2,-160,20,-142,-124,884,-268,-232,884,1010,-268,-124 + $ ,1028,20,200,-88,-106,56,-88,-124,-106,-232,884,2,20,20,-124, + $ -124,56,-232,-88,272,-232,1028,-88/ C 1 T(5,1,2,6,7,3,4) - DATA (CF(I, 50),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 50),I= 7, 12) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 50),I= 13, 18) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 50),I= 19, 24) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 50),I= 25, 30) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 50),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 50),I= 37, 42) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 50),I= 43, 48) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 50),I= 49, 54) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 50),I= 55, 60) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 50),I= 61, 66) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 50),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 50),I= 73, 78) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 50),I= 79, 84) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 50),I= 85, 90) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 50),I= 91, 96) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 50),I= 97,102) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 50),I=103,108) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 50),I=109,114) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ 
,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 50),I=115,120) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ + DATA (CF(I),I=4705,4775) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,-124,1028,20,200,-88,-106,56,-88,-124,-106,-232,884,2,20 + $ ,20,-124,-124,56,-232,-88,272,-232,1028,-88,992,-124,-160,20, + $ -106,38,-448,56,992,-124,-88,-268,-16,2,-160,20,-142,-124,884, + $ -268,-232,884,1010,-268/ C 1 T(5,1,2,7,6,3,4) - DATA (CF(I, 51),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 51),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 51),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 51),I= 19, 24) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 51),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 51),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 51),I= 37, 42) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 51),I= 43, 48) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 51),I= 49, 54) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 51),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 51),I= 61, 66) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 51),I= 67, 72) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 51),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 51),I= 79, 84) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 51),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 51),I= 91, 96) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 51),I= 
97,102) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 51),I=103,108) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 51),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 51),I=115,120) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ + DATA (CF(I),I=4776,4845) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,-160,20,1280,-160,200,20,992,-124,1136,-142,1028,1010,128,-16, + $ -16,2,2,20,-106,-124,-88,-268,-142,38,-106,-88,200,20,1028,-124 + $ ,-88,-232,1028,-88,272,-232,20,2,-124,56,20,-124,-88,56,-232 + $ ,884,-124,-106/ C 1 T(5,1,6,2,7,3,4) - DATA (CF(I, 52),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 52),I= 7, 12) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 52),I= 13, 18) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 52),I= 19, 24) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 52),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 52),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 52),I= 37, 42) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 52),I= 43, 48) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 52),I= 49, 54) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 52),I= 55, 60) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 52),I= 61, 66) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 52),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 52),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 52),I= 79, 84) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ 
,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 52),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 52),I= 91, 96) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 52),I= 97,102) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 52),I=103,108) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 52),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 52),I=115,120) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ + DATA (CF(I),I=4846,4914) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,20 + $ ,200,-160,1280,20,-160,-124,-106,-142,38,-88,-268,-16,128,2,20, + $ -16,2,-124,992,1028,1010,1136,-142,38,-106,20,-160,-124,992, + $ -268,884,1010,-268,-232,884,2,-16,-142,-124,-160,20,56,-448,-88 + $ ,-268,992,-124/ C 1 T(5,1,6,7,2,3,4) - DATA (CF(I, 53),I= 1, 6) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 53),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 53),I= 13, 18) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 53),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 53),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 53),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 53),I= 37, 42) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 53),I= 43, 48) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 53),I= 49, 54) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 53),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 53),I= 61, 66) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 53),I= 67, 72) /-1.580246913580247D+00 - 
$ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 53),I= 73, 78) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 53),I= 79, 84) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 53),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 53),I= 91, 96) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 53),I= 97,102) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 53),I=103,108) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 53),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 53),I=115,120) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ + DATA (CF(I),I=4915,4982) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,-106,-88 + $ ,200,20,1028,-124,-88,-232,1028,-88,272,-232,20,2,-124,56,20, + $ -124,-88,56,-232,884,-124,-106,-160,20,1280,-160,200,20,992, + $ -124,1136,-142,1028,1010,128,-16,-16,2,2,20,-106,-124,-88,-268, + $ -142,38/ C 1 T(5,1,7,2,6,3,4) - DATA (CF(I, 54),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 54),I= 7, 12) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 54),I= 13, 18) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 54),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 54),I= 25, 30) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 54),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 54),I= 37, 42) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 54),I= 43, 48) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 54),I= 49, 54) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ 
,1.264197530864197D+01/ - DATA (CF(I, 54),I= 55, 60) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 54),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 54),I= 67, 72) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 54),I= 73, 78) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 54),I= 79, 84) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 54),I= 85, 90) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 54),I= 91, 96) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 54),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 54),I=103,108) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 54),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 54),I=115,120) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ + DATA (CF(I),I=4983,5049) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16,38,-106,20, + $ -160,-124,992,-268,884,1010,-268,-232,884,2,-16,-142,-124,-160 + $ ,20,56,-448,-88,-268,992,-124,20,200,-160,1280,20,-160,-124, + $ -106,-142,38,-88,-268,-16,128,2,20,-16,2,-124,992,1028,1010 + $ ,1136,-142/ C 1 T(5,1,7,6,2,3,4) - DATA (CF(I, 55),I= 1, 6) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 55),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 55),I= 13, 18) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 55),I= 19, 24) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 55),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 55),I= 31, 36) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 55),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ 
,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 55),I= 43, 48) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 55),I= 49, 54) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 55),I= 55, 60) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 55),I= 61, 66) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 55),I= 67, 72) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 55),I= 73, 78) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 55),I= 79, 84) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 55),I= 85, 90) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 55),I= 91, 96) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 55),I= 97,102) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 55),I=103,108) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 55),I=109,114) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 55),I=115,120) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ + DATA (CF(I),I=5050,5115) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992,-448,56,992, + $ -124,-88,-268,992,-124,-160,20,-106,38,-160,20,-16,2,-124,-142, + $ -232,884,884,-268,-268,1010,56,-88,-124,-106,-232,884,-124,1028 + $ ,20,200,-88,-106,20,-124,2,20,56,-124,272,-232,-232,-88,-88 + $ ,1028/ C 1 T(5,2,1,6,7,3,4) - DATA (CF(I, 56),I= 1, 6) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 56),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 56),I= 13, 18) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 56),I= 19, 24) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 56),I= 25, 30) /-2.469135802469136D-02 - $ 
,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 56),I= 31, 36) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 56),I= 37, 42) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 56),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 56),I= 49, 54) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 56),I= 55, 60) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 56),I= 61, 66) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 56),I= 67, 72) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 56),I= 73, 78) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 56),I= 79, 84) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 56),I= 85, 90) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 56),I= 91, 96) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 56),I= 97,102) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 56),I=103,108) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 56),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 56),I=115,120) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ + DATA (CF(I),I=5116,5180) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,56,-88,-124,-106,-232 + $ ,884,-124,1028,20,200,-88,-106,20,-124,2,20,56,-124,272,-232, + $ -232,-88,-88,1028,-448,56,992,-124,-88,-268,992,-124,-160,20, + $ -106,38,-160,20,-16,2,-124,-142,-232,884,884,-268,-268,1010/ C 1 T(5,2,1,7,6,3,4) - DATA (CF(I, 57),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 57),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 57),I= 13, 
18) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 57),I= 19, 24) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 57),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 57),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 57),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 57),I= 43, 48) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 57),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 57),I= 55, 60) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 57),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 57),I= 67, 72) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 57),I= 73, 78) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 57),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 57),I= 85, 90) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 57),I= 91, 96) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 57),I= 97,102) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 57),I=103,108) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 57),I=109,114) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 57),I=115,120) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=5181,5244) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,992,-124,1136,-142,1028 + $ ,1010,-160,20,1280,-160,200,20,-16,2,128,-16,20,2,-88,-268,-106 + $ ,-124,38,-142,-88,-232,1028,-88,272,-232,-106,-88,200,20,1028, + $ -124,-124,56,20,2,-124,20,-232,884,-88,56,-106,-124/ C 1 
T(5,2,6,1,7,3,4) - DATA (CF(I, 58),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 58),I= 7, 12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 58),I= 13, 18) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 58),I= 19, 24) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 58),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 58),I= 31, 36) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 58),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 58),I= 43, 48) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 58),I= 49, 54) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 58),I= 55, 60) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 58),I= 61, 66) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 58),I= 67, 72) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 58),I= 73, 78) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 58),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 58),I= 85, 90) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 58),I= 91, 96) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 58),I= 97,102) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 58),I=103,108) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 58),I=109,114) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 58),I=115,120) /-1.358024691358025D-01, - $ 
-4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=5245,5307) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,-124,-106,-142,38,-88,-268,20 + $ ,200,-160,1280,20,-160,2,20,-16,128,2,-16,1028,1010,-124,992, + $ -142,1136,-268,884,1010,-268,-232,884,38,-106,20,-160,-124,992, + $ -142,-124,2,-16,20,-160,-88,-268,56,-448,-124,992/ C 1 T(5,2,6,7,1,3,4) - DATA (CF(I, 59),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 59),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 59),I= 13, 18) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 59),I= 19, 24) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 59),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 59),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 59),I= 37, 42) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 59),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 59),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 59),I= 55, 60) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 59),I= 61, 66) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 59),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 59),I= 73, 78) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 59),I= 79, 84) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 59),I= 85, 90) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 59),I= 91, 96) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 59),I= 97,102) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 59),I=103,108) 
/-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 59),I=109,114) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 59),I=115,120) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ + DATA (CF(I),I=5308,5369) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,-88,-232,1028,-88,272,-232,-106,-88 + $ ,200,20,1028,-124,-124,56,20,2,-124,20,-232,884,-88,56,-106, + $ -124,992,-124,1136,-142,1028,1010,-160,20,1280,-160,200,20,-16 + $ ,2,128,-16,20,2,-88,-268,-106,-124,38,-142/ C 1 T(5,2,7,1,6,3,4) - DATA (CF(I, 60),I= 1, 6) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 60),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 60),I= 13, 18) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 60),I= 19, 24) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 60),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 60),I= 31, 36) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 60),I= 37, 42) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 60),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 60),I= 49, 54) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 60),I= 55, 60) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 60),I= 61, 66) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 60),I= 67, 72) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 60),I= 73, 78) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 60),I= 79, 84) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 60),I= 85, 90) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ 
-2.469135802469136D-01/ - DATA (CF(I, 60),I= 91, 96) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 60),I= 97,102) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 60),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 60),I=109,114) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 60),I=115,120) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ + DATA (CF(I),I=5370,5430) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,-268,884,1010,-268,-232,884,38,-106,20, + $ -160,-124,992,-142,-124,2,-16,20,-160,-88,-268,56,-448,-124,992 + $ ,-124,-106,-142,38,-88,-268,20,200,-160,1280,20,-160,2,20,-16 + $ ,128,2,-16,1028,1010,-124,992,-142,1136/ C 1 T(5,2,7,6,1,3,4) - DATA (CF(I, 61),I= 1, 6) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 61),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 61),I= 13, 18) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 61),I= 19, 24) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 61),I= 25, 30) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 61),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 61),I= 37, 42) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 61),I= 43, 48) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 61),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 61),I= 55, 60) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 61),I= 61, 66) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 61),I= 67, 72) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 61),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ 
-2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 61),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 61),I= 85, 90) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 61),I= 91, 96) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 61),I= 97,102) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 61),I=103,108) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 61),I=109,114) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 61),I=115,120) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ + DATA (CF(I),I=5431,5490) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,-16,2,128,-16,20,2,-160,20,-16,2,-124,-142, + $ -1024,128,128,-16,-16,-160,-124,20,56,-124,2,20,884,-232,-106, + $ -124,-88,56,-232,272,-88,1028,-232,-88,-124,20,56,-124,2,20 + $ ,1028,-124,-88,-106,20,200/ C 1 T(5,6,1,2,7,3,4) - DATA (CF(I, 62),I= 1, 6) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 62),I= 7, 12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 62),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 62),I= 19, 24) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 62),I= 25, 30) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 62),I= 31, 36) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 62),I= 37, 42) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 62),I= 43, 48) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 62),I= 49, 54) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 62),I= 55, 60) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 62),I= 61, 66) /-1.580246913580247D+00 - $ 
,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 62),I= 67, 72) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 62),I= 73, 78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 62),I= 79, 84) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 62),I= 85, 90) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 62),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 62),I= 97,102) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 62),I=103,108) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 62),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 62),I=115,120) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ + DATA (CF(I),I=5491,5549) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16,2,20,-16,128,2,-16,20,-124,2,20,56,-124,128,-1024 + $ ,-16,-160,128,-16,20,-160,-124,-142,-16,2,-268,-88,-124,992,56, + $ -448,884,-232,-268,1010,884,-268,20,-160,-124,-142,-16,2,-124 + $ ,992,-106,38,-160,20/ C 1 T(5,6,1,7,2,3,4) - DATA (CF(I, 63),I= 1, 6) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 63),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 63),I= 13, 18) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 63),I= 19, 24) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 63),I= 25, 30) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 63),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 63),I= 43, 48) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 63),I= 49, 54) /1.975308641975309D+00, - 
$ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 63),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 63),I= 61, 66) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 63),I= 67, 72) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 63),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 63),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 63),I= 91, 96) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I= 97,102) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 63),I=103,108) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 63),I=109,114) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I=115,120) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ + DATA (CF(I),I=5550,5607) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16,-160,20,-16,2,-124,-142,-16,2,128,-16,20,2,128,-16, + $ -1024,128,-160,-16,56,-124,-124,20,20,2,-232,272,-88,1028,-232, + $ -88,884,-232,-106,-124,-88,56,56,-124,-124,20,20,2,-88,-106 + $ ,1028,-124,200,20/ C 1 T(5,6,2,1,7,3,4) - DATA (CF(I, 64),I= 1, 6) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 64),I= 7, 12) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 64),I= 13, 18) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 64),I= 19, 24) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 64),I= 25, 30) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 64),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I= 37, 42) /3.086419753086420D-02 - 
$ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 64),I= 43, 48) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 64),I= 49, 54) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 64),I= 55, 60) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 64),I= 61, 66) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 64),I= 67, 72) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 64),I= 73, 78) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 64),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I= 85, 90) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 64),I= 91, 96) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I= 97,102) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 64),I=103,108) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 64),I=109,114) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I=115,120) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=5608,5664) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,20,-124,2,20,56,-124,2,20,-16,128,2,-16,-16,-160,128,-1024 + $ ,-16,128,-124,-142,20,-160,2,-16,884,-232,-268,1010,884,-268, + $ -268,-88,-124,992,56,-448,-124,-142,20,-160,2,-16,-106,38,-124 + $ ,992,20,-160/ C 1 T(5,6,2,7,1,3,4) - DATA (CF(I, 65),I= 1, 6) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 65),I= 7, 12) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 65),I= 13, 18) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 65),I= 19, 24) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 65),I= 25, 30) 
/1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 65),I= 31, 36) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 65),I= 37, 42) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 65),I= 43, 48) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 65),I= 49, 54) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 65),I= 55, 60) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 65),I= 61, 66) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 65),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 65),I= 73, 78) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 65),I= 79, 84) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 65),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 65),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 65),I= 97,102) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 65),I=103,108) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 65),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 65),I=115,120) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ + DATA (CF(I),I=5665,5720) /4096,-1024,-16,128,-160,-16,-1024,128, + $ -142,-124,2,-16,20,-160,-124,56,20,2,-124,20,-16,128,-160,-16, + $ -1024,128,2,-16,20,2,128,-16,1010,1028,-142,1136,-124,992,-268, + $ -88,38,-142,-106,-124,2,-16,20,2,128,-16,20,-160,200,20,1280, + $ -160/ C 1 T(5,6,7,1,2,3,4) - DATA (CF(I, 66),I= 1, 6) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 66),I= 7, 12) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 66),I= 13, 18) /-1.635802469135803D-01, - $ 
-1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 66),I= 19, 24) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 66),I= 25, 30) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 66),I= 31, 36) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 66),I= 37, 42) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 66),I= 43, 48) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 66),I= 49, 54) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 66),I= 55, 60) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 66),I= 61, 66) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 66),I= 67, 72) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 66),I= 73, 78) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 66),I= 79, 84) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 66),I= 85, 90) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 66),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 66),I= 97,102) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 66),I=103,108) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 66),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 66),I=115,120) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ + DATA (CF(I),I=5721,5775) /4096,-160,-16,-16,128,128,-1024,-124 + $ ,56,20,2,-124,20,-142,-124,2,-16,20,-160,-160,-16,-16,128,128, + $ -1024,20,2,2,-16,-16,128,-268,-88,38,-142,-106,-124,1010,1028, + $ -142,1136,-124,992,20,2,2,-16,-16,128,200,20,20,-160,-160,1280/ C 1 T(5,6,7,2,1,3,4) - DATA (CF(I, 67),I= 1, 6) /-2.191358024691358D-01 - $ 
,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 67),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 67),I= 13, 18) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 67),I= 19, 24) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 67),I= 25, 30) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 67),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 67),I= 37, 42) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 67),I= 43, 48) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 67),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 67),I= 55, 60) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 67),I= 61, 66) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 67),I= 67, 72) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 67),I= 73, 78) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 67),I= 79, 84) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 67),I= 85, 90) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 67),I= 91, 96) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 67),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 67),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 67),I=109,114) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 67),I=115,120) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ + 
DATA (CF(I),I=5776,5829) /4096,-1024,-1024,128,128,1280,884,-232 + $ ,-106,-124,-88,56,-232,272,-88,1028,-232,-88,-124,20,56,-124,2 + $ ,20,1028,-124,-88,-106,20,200,-16,2,128,-16,20,2,-160,20,-16,2, + $ -124,-142,-1024,128,128,-16,-16,-160,-124,20,56,-124,2,20/ C 1 T(5,7,1,2,6,3,4) - DATA (CF(I, 68),I= 1, 6) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 68),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 68),I= 13, 18) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 68),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 68),I= 25, 30) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 68),I= 31, 36) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 68),I= 37, 42) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 68),I= 43, 48) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 68),I= 49, 54) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 68),I= 55, 60) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 68),I= 61, 66) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 68),I= 67, 72) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 68),I= 73, 78) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 68),I= 79, 84) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 68),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 68),I= 91, 96) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 68),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 68),I=103,108) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 68),I=109,114) 
/1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 68),I=115,120) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ + DATA (CF(I),I=5830,5882) /4096,128,1280,-1024,128,-268,-88,-124 + $ ,992,56,-448,884,-232,-268,1010,884,-268,20,-160,-124,-142,-16 + $ ,2,-124,992,-106,38,-160,20,2,20,-16,128,2,-16,20,-124,2,20,56, + $ -124,128,-1024,-16,-160,128,-16,20,-160,-124,-142,-16,2/ C 1 T(5,7,1,6,2,3,4) - DATA (CF(I, 69),I= 1, 6) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 69),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 69),I= 13, 18) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 69),I= 19, 24) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 69),I= 25, 30) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 69),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 69),I= 37, 42) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 69),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 69),I= 49, 54) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 69),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 69),I= 61, 66) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 69),I= 67, 72) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 69),I= 73, 78) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 69),I= 79, 84) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 69),I= 85, 90) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 69),I= 91, 96) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 69),I= 97,102) /-2.469135802469136D-01 - $ 
,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 69),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 69),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 69),I=115,120) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ + DATA (CF(I),I=5883,5934) /4096,-1024,1280,128,-232,272,-88,1028, + $ -232,-88,884,-232,-106,-124,-88,56,56,-124,-124,20,20,2,-88, + $ -106,1028,-124,200,20,-160,20,-16,2,-124,-142,-16,2,128,-16,20 + $ ,2,128,-16,-1024,128,-160,-16,56,-124,-124,20,20,2/ C 1 T(5,7,2,1,6,3,4) - DATA (CF(I, 70),I= 1, 6) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 70),I= 7, 12) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 70),I= 13, 18) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 70),I= 19, 24) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 70),I= 25, 30) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 70),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 70),I= 37, 42) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 70),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 70),I= 49, 54) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 70),I= 55, 60) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 70),I= 61, 66) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 70),I= 67, 72) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 70),I= 73, 78) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 70),I= 79, 84) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 70),I= 85, 90) /-1.913580246913580D-01, - $ 
-2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 70),I= 91, 96) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 70),I= 97,102) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 70),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 70),I=109,114) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 70),I=115,120) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=5935,5985) /4096,128,-1024,884,-232,-268,1010,884, + $ -268,-268,-88,-124,992,56,-448,-124,-142,20,-160,2,-16,-106,38, + $ -124,992,20,-160,20,-124,2,20,56,-124,2,20,-16,128,2,-16,-16, + $ -160,128,-1024,-16,128,-124,-142,20,-160,2,-16/ C 1 T(5,7,2,6,1,3,4) - DATA (CF(I, 71),I= 1, 6) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 71),I= 7, 12) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 71),I= 13, 18) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 71),I= 19, 24) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 71),I= 25, 30) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 71),I= 31, 36) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 71),I= 37, 42) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 71),I= 43, 48) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 71),I= 49, 54) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 71),I= 55, 60) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 71),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 71),I= 67, 72) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 71),I= 73, 78) /1.558641975308642D+00 - $ 
,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 71),I= 79, 84) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 71),I= 85, 90) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 71),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 71),I= 97,102) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 71),I=103,108) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 71),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 71),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ + DATA (CF(I),I=5986,6035) /4096,-1024,1010,1028,-142,1136,-124 + $ ,992,-268,-88,38,-142,-106,-124,2,-16,20,2,128,-16,20,-160,200 + $ ,20,1280,-160,-142,-124,2,-16,20,-160,-124,56,20,2,-124,20,-16 + $ ,128,-160,-16,-1024,128,2,-16,20,2,128,-16/ C 1 T(5,7,6,1,2,3,4) - DATA (CF(I, 72),I= 1, 6) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 72),I= 7, 12) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 72),I= 13, 18) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 72),I= 19, 24) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 72),I= 25, 30) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 72),I= 31, 36) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 72),I= 37, 42) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 72),I= 43, 48) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 72),I= 49, 54) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 72),I= 55, 60) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 72),I= 61, 66) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ 
,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 72),I= 67, 72) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 72),I= 73, 78) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 72),I= 79, 84) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 72),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 72),I= 91, 96) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 72),I= 97,102) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 72),I=103,108) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 72),I=109,114) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 72),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6036,6084) /4096,-268,-88,38,-142,-106,-124,1010 + $ ,1028,-142,1136,-124,992,20,2,2,-16,-16,128,200,20,20,-160,-160 + $ ,1280,-124,56,20,2,-124,20,-142,-124,2,-16,20,-160,-160,-16,-16 + $ ,128,128,-1024,20,2,2,-16,-16,128/ C 1 T(5,7,6,2,1,3,4) - DATA (CF(I, 73),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 73),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 73),I= 13, 18) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 73),I= 19, 24) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 73),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 73),I= 31, 36) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 73),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 73),I= 43, 48) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 73),I= 49, 54) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 
- $ ,5.864197530864197D-02/ - DATA (CF(I, 73),I= 55, 60) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 73),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 73),I= 67, 72) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 73),I= 73, 78) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 73),I= 79, 84) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 73),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 73),I= 91, 96) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 73),I= 97,102) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 73),I=103,108) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 73),I=109,114) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 73),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=6085,6132) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448,1028,-124,-88,-106,20,200,-88,56,-232,884,-124, + $ -106,-232,-88,272,-232,1028,-88,2,20,20,-124,-124,56/ C 1 T(6,1,2,5,7,3,4) - DATA (CF(I, 74),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 74),I= 7, 12) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 74),I= 13, 18) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 74),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 74),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 74),I= 31, 36) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 74),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - 
DATA (CF(I, 74),I= 43, 48) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 74),I= 49, 54) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 74),I= 55, 60) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 74),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 74),I= 67, 72) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 74),I= 73, 78) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 74),I= 79, 84) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 74),I= 85, 90) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 74),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 74),I= 97,102) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 74),I=103,108) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 74),I=109,114) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 74),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=6133,6179) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,-124,992,-106,38,-160,20,56,-448,-88,-268,992,-124,884, + $ -268,-232,884,1010,-268,-16,2,-160,20,-142,-124/ C 1 T(6,1,2,7,5,3,4) - DATA (CF(I, 75),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 75),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 75),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 75),I= 19, 24) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 75),I= 25, 30) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 75),I= 31, 36) 
/-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 75),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 75),I= 43, 48) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 75),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 75),I= 55, 60) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 75),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 75),I= 67, 72) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 75),I= 73, 78) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 75),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 75),I= 85, 90) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 75),I= 91, 96) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 75),I= 97,102) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 75),I=103,108) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 75),I=109,114) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 75),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ + DATA (CF(I),I=6180,6225) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,-88,-106,1028,-124,200,20,-232,-88,272,-232,1028,-88,-88,56, + $ -232,884,-124,-106,20,2,-124,56,20,-124/ C 1 T(6,1,5,2,7,3,4) - DATA (CF(I, 76),I= 1, 6) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 76),I= 7, 12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 76),I= 13, 18) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 76),I= 19, 24) /-1.913580246913580D-01, - $ 
-2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 76),I= 25, 30) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 76),I= 31, 36) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 76),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 76),I= 43, 48) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 76),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 76),I= 55, 60) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 76),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 76),I= 67, 72) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 76),I= 73, 78) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 76),I= 79, 84) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 76),I= 85, 90) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 76),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 76),I= 97,102) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 76),I=103,108) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 76),I=109,114) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 76),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ + DATA (CF(I),I=6226,6270) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,-106 + $ ,38,-124,992,20,-160,884,-268,-232,884,1010,-268,56,-448,-88, + $ -268,992,-124,2,-16,-142,-124,-160,20/ C 1 T(6,1,5,7,2,3,4) - DATA (CF(I, 77),I= 1, 6) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 77),I= 7, 12) /-1.913580246913580D-01 - $ 
,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 77),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 77),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 77),I= 25, 30) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 77),I= 31, 36) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 77),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 77),I= 43, 48) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 77),I= 49, 54) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 77),I= 55, 60) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 77),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 77),I= 67, 72) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 77),I= 73, 78) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 77),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 77),I= 85, 90) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 77),I= 91, 96) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 77),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 77),I=103,108) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 77),I=109,114) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 77),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ + DATA (CF(I),I=6271,6314) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,20,-160 + $ ,200,20,1280,-160,-124,992,1028,1010,1136,-142,-106,-124,-88, + $ 
-268,-142,38,128,-16,-16,2,2,20/ C 1 T(6,1,7,2,5,3,4) - DATA (CF(I, 78),I= 1, 6) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 78),I= 7, 12) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 78),I= 13, 18) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 78),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 78),I= 25, 30) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 78),I= 31, 36) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 78),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 78),I= 43, 48) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 78),I= 49, 54) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 78),I= 55, 60) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 78),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 78),I= 67, 72) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 78),I= 73, 78) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 78),I= 79, 84) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 78),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 78),I= 91, 96) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 78),I= 97,102) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 78),I=103,108) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 78),I=109,114) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 78),I=115,120) /-2.469135802469136D-02 - $ 
,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ + DATA (CF(I),I=6315,6357) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16,200,20,20, + $ -160,-160,1280,-106,-124,-88,-268,-142,38,-124,992,1028,1010 + $ ,1136,-142,-16,128,2,20,-16,2/ C 1 T(6,1,7,5,2,3,4) - DATA (CF(I, 79),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 79),I= 7, 12) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 79),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 79),I= 19, 24) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 79),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 79),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 79),I= 37, 42) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 79),I= 43, 48) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 79),I= 49, 54) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 79),I= 55, 60) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 79),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 79),I= 67, 72) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 79),I= 73, 78) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 79),I= 79, 84) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 79),I= 85, 90) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 79),I= 91, 96) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 79),I= 97,102) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 79),I=103,108) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ 
-1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 79),I=109,114) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 79),I=115,120) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ + DATA (CF(I),I=6358,6399) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992,-88,56,-232 + $ ,884,-124,-106,1028,-124,-88,-106,20,200,272,-232,-232,-88,-88 + $ ,1028,20,-124,2,20,56,-124/ C 1 T(6,2,1,5,7,3,4) - DATA (CF(I, 80),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 80),I= 7, 12) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 80),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 80),I= 19, 24) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 80),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 80),I= 31, 36) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 80),I= 37, 42) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 80),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 80),I= 49, 54) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 80),I= 55, 60) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 80),I= 61, 66) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 80),I= 67, 72) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 80),I= 73, 78) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 80),I= 79, 84) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 80),I= 85, 90) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 80),I= 91, 96) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA 
(CF(I, 80),I= 97,102) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 80),I=103,108) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 80),I=109,114) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 80),I=115,120) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ + DATA (CF(I),I=6400,6440) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,56,-448,-88,-268,992, + $ -124,-124,992,-106,38,-160,20,-232,884,884,-268,-268,1010,-160 + $ ,20,-16,2,-124,-142/ C 1 T(6,2,1,7,5,3,4) - DATA (CF(I, 81),I= 1, 6) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 81),I= 7, 12) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 81),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 81),I= 19, 24) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 81),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 81),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 81),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 81),I= 43, 48) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 81),I= 49, 54) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 81),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 81),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 81),I= 67, 72) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 81),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 81),I= 79, 84) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 81),I= 85, 90) /1.975308641975309D-01, - $ 
-2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 81),I= 91, 96) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 81),I= 97,102) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 81),I=103,108) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 81),I=109,114) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 81),I=115,120) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ + DATA (CF(I),I=6441,6480) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,-232,-88,272,-232,1028,-88, + $ -88,-106,1028,-124,200,20,-232,884,-88,56,-106,-124,-124,56,20 + $ ,2,-124,20/ C 1 T(6,2,5,1,7,3,4) - DATA (CF(I, 82),I= 1, 6) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 82),I= 7, 12) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 82),I= 13, 18) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 82),I= 19, 24) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 82),I= 25, 30) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 82),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 82),I= 37, 42) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 82),I= 43, 48) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 82),I= 49, 54) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 82),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 82),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 82),I= 67, 72) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 82),I= 73, 78) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ 
,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 82),I= 79, 84) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 82),I= 85, 90) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 82),I= 91, 96) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 82),I= 97,102) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 82),I=103,108) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 82),I=109,114) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 82),I=115,120) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=6481,6519) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,884,-268,-232,884,1010,-268, + $ -106,38,-124,992,20,-160,-88,-268,56,-448,-124,992,-142,-124,2, + $ -16,20,-160/ C 1 T(6,2,5,7,1,3,4) - DATA (CF(I, 83),I= 1, 6) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 83),I= 7, 12) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 83),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 83),I= 19, 24) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 83),I= 25, 30) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 83),I= 31, 36) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 83),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 83),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 83),I= 49, 54) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 83),I= 55, 60) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 83),I= 61, 66) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - 
DATA (CF(I, 83),I= 67, 72) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 83),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 83),I= 79, 84) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 83),I= 85, 90) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 83),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 83),I= 97,102) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 83),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 83),I=109,114) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 83),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ + DATA (CF(I),I=6520,6557) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,-124,992,1028,1010,1136,-142,20, + $ -160,200,20,1280,-160,-88,-268,-106,-124,38,-142,-16,2,128,-16 + $ ,20,2/ C 1 T(6,2,7,1,5,3,4) - DATA (CF(I, 84),I= 1, 6) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 84),I= 7, 12) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 84),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 84),I= 19, 24) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 84),I= 25, 30) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 84),I= 31, 36) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 84),I= 37, 42) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 84),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 84),I= 49, 54) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 84),I= 55, 60) /5.864197530864197D-02, - $ 
-1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 84),I= 61, 66) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 84),I= 67, 72) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 84),I= 73, 78) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 84),I= 79, 84) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 84),I= 85, 90) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 84),I= 91, 96) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 84),I= 97,102) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 84),I=103,108) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 84),I=109,114) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 84),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6558,6594) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,-106,-124,-88,-268,-142,38,200,20,20, + $ -160,-160,1280,1028,1010,-124,992,-142,1136,2,20,-16,128,2,-16/ C 1 T(6,2,7,5,1,3,4) - DATA (CF(I, 85),I= 1, 6) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 85),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 85),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 85),I= 19, 24) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 85),I= 25, 30) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 85),I= 31, 36) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 85),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 85),I= 43, 48) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ 
-4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 85),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 85),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 85),I= 61, 66) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 85),I= 67, 72) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 85),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 85),I= 79, 84) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 85),I= 85, 90) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 85),I= 91, 96) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 85),I= 97,102) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 85),I=103,108) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 85),I=109,114) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 85),I=115,120) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ + DATA (CF(I),I=6595,6630) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,-232,884,-88,56,-106,-124,272,-232,-232,-88, + $ -88,1028,1028,-124,-88,-106,20,200,-124,20,56,-124,2,20/ C 1 T(6,5,1,2,7,3,4) - DATA (CF(I, 86),I= 1, 6) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 86),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 86),I= 13, 18) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 86),I= 19, 24) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 86),I= 25, 30) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 86),I= 31, 36) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA 
(CF(I, 86),I= 37, 42) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 86),I= 43, 48) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 86),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 86),I= 55, 60) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 86),I= 61, 66) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 86),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 86),I= 73, 78) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 86),I= 79, 84) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 86),I= 85, 90) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 86),I= 91, 96) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 86),I= 97,102) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 86),I=103,108) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 86),I=109,114) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 86),I=115,120) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ + DATA (CF(I),I=6631,6665) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16,-88,-268,56,-448,-124,992,-232,884,884,-268,-268 + $ ,1010,-124,992,-106,38,-160,20,20,-160,-124,-142,-16,2/ C 1 T(6,5,1,7,2,3,4) - DATA (CF(I, 87),I= 1, 6) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 87),I= 7, 12) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 87),I= 13, 18) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 87),I= 19, 24) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 87),I= 25, 30) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ 
-2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 87),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 87),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 87),I= 43, 48) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 87),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 87),I= 55, 60) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 87),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 87),I= 67, 72) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 87),I= 73, 78) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 87),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 87),I= 85, 90) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 87),I= 91, 96) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 87),I= 97,102) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 87),I=103,108) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 87),I=109,114) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 87),I=115,120) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ + DATA (CF(I),I=6666,6699) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16,272,-232,-232,-88,-88,1028,-232,884,-88,56,-106,-124, + $ -88,-106,1028,-124,200,20,56,-124,-124,20,20,2/ C 1 T(6,5,2,1,7,3,4) - DATA (CF(I, 88),I= 1, 6) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 88),I= 7, 12) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 88),I= 13, 18) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 88),I= 19, 24) 
/4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 88),I= 25, 30) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 88),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 88),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 88),I= 43, 48) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 88),I= 49, 54) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 88),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 88),I= 61, 66) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 88),I= 67, 72) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 88),I= 73, 78) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 88),I= 79, 84) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 88),I= 85, 90) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 88),I= 91, 96) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 88),I= 97,102) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 88),I=103,108) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 88),I=109,114) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 88),I=115,120) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6700,6732) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,-232,884,884,-268,-268,1010,-88,-268,56,-448,-124,992,-106 + $ ,38,-124,992,20,-160,-124,-142,20,-160,2,-16/ C 1 T(6,5,2,7,1,3,4) - DATA (CF(I, 89),I= 1, 6) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 89),I= 7, 12) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ 
-2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 89),I= 13, 18) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 89),I= 19, 24) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 89),I= 25, 30) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 89),I= 31, 36) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 89),I= 37, 42) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 89),I= 43, 48) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 89),I= 49, 54) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 89),I= 55, 60) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 89),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 89),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 89),I= 73, 78) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 89),I= 79, 84) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 89),I= 85, 90) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 89),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 89),I= 97,102) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 89),I=103,108) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 89),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 89),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ + DATA (CF(I),I=6733,6764) /4096,-1024,-16,128,-160,-16,-1024,128 + $ ,1028,1010,-124,992,-142,1136,-88,-268,-106,-124,38,-142,20, + $ -160,200,20,1280,-160,2,-16,20,2,128,-16/ C 1 T(6,5,7,1,2,3,4) - DATA (CF(I, 90),I= 1, 6) /-1.358024691358025D-01, - $ 
-3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 90),I= 7, 12) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 90),I= 13, 18) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 90),I= 19, 24) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 90),I= 25, 30) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 90),I= 31, 36) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 90),I= 37, 42) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 90),I= 43, 48) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 90),I= 49, 54) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 90),I= 55, 60) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 90),I= 61, 66) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 90),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 90),I= 73, 78) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 90),I= 79, 84) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 90),I= 85, 90) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 90),I= 91, 96) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 90),I= 97,102) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 90),I=103,108) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 90),I=109,114) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 90),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA 
(CF(I),I=6765,6795) /4096,-160,-16,-16,128,128,-1024,-88, + $ -268,-106,-124,38,-142,1028,1010,-124,992,-142,1136,200,20,20, + $ -160,-160,1280,20,2,2,-16,-16,128/ C 1 T(6,5,7,2,1,3,4) - DATA (CF(I, 91),I= 1, 6) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 91),I= 7, 12) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 91),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 91),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 91),I= 25, 30) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 91),I= 31, 36) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 91),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 91),I= 43, 48) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 91),I= 49, 54) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 91),I= 55, 60) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 91),I= 61, 66) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 91),I= 67, 72) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 91),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 91),I= 79, 84) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 91),I= 85, 90) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 91),I= 91, 96) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 91),I= 97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 91),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 91),I=109,114) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ 
-1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 91),I=115,120) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=6796,6825) /4096,-1024,-1024,128,128,1280,2,-16,20 + $ ,2,128,-16,20,-160,-124,-142,-16,2,-124,20,56,-124,2,20,-1024 + $ ,128,128,-16,-16,-160/ C 1 T(6,7,1,2,5,3,4) - DATA (CF(I, 92),I= 1, 6) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 92),I= 7, 12) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 92),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 92),I= 19, 24) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 92),I= 25, 30) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 92),I= 31, 36) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 92),I= 37, 42) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 92),I= 43, 48) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 92),I= 49, 54) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 92),I= 55, 60) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 92),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 92),I= 67, 72) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 92),I= 73, 78) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 92),I= 79, 84) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 92),I= 85, 90) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 92),I= 91, 96) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 92),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 92),I=103,108) /-1.913580246913580D-01 
- $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 92),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 92),I=115,120) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6826,6854) /4096,128,1280,-1024,128,20,2,2,-16,-16 + $ ,128,-124,20,56,-124,2,20,20,-160,-124,-142,-16,2,128,-1024,-16 + $ ,-160,128,-16/ C 1 T(6,7,1,5,2,3,4) - DATA (CF(I, 93),I= 1, 6) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 93),I= 7, 12) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 93),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 93),I= 19, 24) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 93),I= 25, 30) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 93),I= 31, 36) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 93),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 93),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 93),I= 49, 54) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 93),I= 55, 60) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 93),I= 61, 66) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 93),I= 67, 72) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 93),I= 73, 78) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 93),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 93),I= 85, 90) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 93),I= 91, 96) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA 
(CF(I, 93),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 93),I=103,108) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 93),I=109,114) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 93),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6855,6882) /4096,-1024,1280,128,20,-160,-124,-142, + $ -16,2,2,-16,20,2,128,-16,56,-124,-124,20,20,2,128,-16,-1024,128 + $ ,-160,-16/ C 1 T(6,7,2,1,5,3,4) - DATA (CF(I, 94),I= 1, 6) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 94),I= 7, 12) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 94),I= 13, 18) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 94),I= 19, 24) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 94),I= 25, 30) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 94),I= 31, 36) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 94),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 94),I= 43, 48) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 94),I= 49, 54) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 94),I= 55, 60) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 94),I= 61, 66) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 94),I= 67, 72) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 94),I= 73, 78) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 94),I= 79, 84) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 94),I= 85, 90) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ 
,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 94),I= 91, 96) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 94),I= 97,102) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 94),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 94),I=109,114) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 94),I=115,120) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6883,6909) /4096,128,-1024,-124,20,56,-124,2,20,20 + $ ,2,2,-16,-16,128,-124,-142,20,-160,2,-16,-16,-160,128,-1024,-16 + $ ,128/ C 1 T(6,7,2,5,1,3,4) - DATA (CF(I, 95),I= 1, 6) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 95),I= 7, 12) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 95),I= 13, 18) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 95),I= 19, 24) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 95),I= 25, 30) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 95),I= 31, 36) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 95),I= 37, 42) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 95),I= 43, 48) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 95),I= 49, 54) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 95),I= 55, 60) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 95),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 95),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 95),I= 73, 78) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 95),I= 79, 84) /-6.913580246913580D-01 - $ 
,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 95),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 95),I= 91, 96) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 95),I= 97,102) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 95),I=103,108) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 95),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 95),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6910,6935) /4096,-1024,-124,-142,20,-160,2,-16,56, + $ -124,-124,20,20,2,2,-16,20,2,128,-16,-16,128,-160,-16,-1024,128/ C 1 T(6,7,5,1,2,3,4) - DATA (CF(I, 96),I= 1, 6) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 96),I= 7, 12) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 96),I= 13, 18) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 96),I= 19, 24) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 96),I= 25, 30) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 96),I= 31, 36) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 96),I= 37, 42) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 96),I= 43, 48) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 96),I= 49, 54) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 96),I= 55, 60) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 96),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 96),I= 67, 72) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 96),I= 73, 
78) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 96),I= 79, 84) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 96),I= 85, 90) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 96),I= 91, 96) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 96),I= 97,102) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 96),I=103,108) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 96),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 96),I=115,120) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ + DATA (CF(I),I=6936,6960) /4096,56,-124,-124,20,20,2,-124,-142,20 + $ ,-160,2,-16,20,2,2,-16,-16,128,-160,-16,-16,128,128,-1024/ C 1 T(6,7,5,2,1,3,4) - DATA (CF(I, 97),I= 1, 6) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 97),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 97),I= 13, 18) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 97),I= 19, 24) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 97),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 97),I= 31, 36) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 97),I= 37, 42) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 97),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 97),I= 49, 54) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 97),I= 55, 60) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 97),I= 61, 66) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA 
(CF(I, 97),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 97),I= 73, 78) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 97),I= 79, 84) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 97),I= 85, 90) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 97),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 97),I= 97,102) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 97),I=103,108) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 97),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 97),I=115,120) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ + DATA (CF(I),I=6961,6984) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448/ C 1 T(7,1,2,5,6,3,4) - DATA (CF(I, 98),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 98),I= 7, 12) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 98),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 98),I= 19, 24) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 98),I= 25, 30) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 98),I= 31, 36) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 98),I= 37, 42) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 98),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 98),I= 49, 54) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 98),I= 55, 60) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ 
,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 98),I= 61, 66) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 98),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 98),I= 73, 78) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 98),I= 79, 84) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 98),I= 85, 90) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 98),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 98),I= 97,102) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 98),I=103,108) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 98),I=109,114) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 98),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ + DATA (CF(I),I=6985,7007) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992/ C 1 T(7,1,2,6,5,3,4) - DATA (CF(I, 99),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 99),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 99),I= 13, 18) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 99),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 99),I= 25, 30) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 99),I= 31, 36) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 99),I= 37, 42) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 99),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 99),I= 49, 54) /3.086419753086420D-02, - $ 
-2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 99),I= 55, 60) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 99),I= 61, 66) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 99),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 99),I= 73, 78) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 99),I= 79, 84) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 99),I= 85, 90) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 99),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 99),I= 97,102) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 99),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 99),I=109,114) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 99),I=115,120) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=7008,7029) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992/ C 1 T(7,1,5,2,6,3,4) - DATA (CF(I,100),I= 1, 6) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,100),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,100),I= 13, 18) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,100),I= 19, 24) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,100),I= 25, 30) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,100),I= 31, 36) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,100),I= 37, 42) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,100),I= 43, 
48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I,100),I= 49, 54) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,100),I= 55, 60) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,100),I= 61, 66) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,100),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I,100),I= 73, 78) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,100),I= 79, 84) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,100),I= 85, 90) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,100),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,100),I= 97,102) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,100),I=103,108) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I,100),I=109,114) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,100),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ + DATA (CF(I),I=7030,7050) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160/ C 1 T(7,1,5,6,2,3,4) - DATA (CF(I,101),I= 1, 6) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,101),I= 7, 12) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,101),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,101),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,101),I= 25, 30) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,101),I= 31, 36) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA 
(CF(I,101),I= 37, 42) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,101),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,101),I= 49, 54) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,101),I= 55, 60) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,101),I= 61, 66) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,101),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I,101),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,101),I= 79, 84) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,101),I= 85, 90) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,101),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,101),I= 97,102) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I,101),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,101),I=109,114) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,101),I=115,120) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=7051,7070) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160/ C 1 T(7,1,6,2,5,3,4) - DATA (CF(I,102),I= 1, 6) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,102),I= 7, 12) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,102),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,102),I= 19, 24) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,102),I= 25, 30) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ 
-02/ - DATA (CF(I,102),I= 31, 36) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,102),I= 37, 42) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,102),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I,102),I= 49, 54) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,102),I= 55, 60) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,102),I= 61, 66) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,102),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,102),I= 73, 78) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,102),I= 79, 84) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,102),I= 85, 90) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,102),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I,102),I= 97,102) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I,102),I=103,108) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,102),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,102),I=115,120) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=7071,7089) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16/ C 1 T(7,1,6,5,2,3,4) - DATA (CF(I,103),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,103),I= 7, 12) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,103),I= 13, 18) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,103),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ 
-02/ - DATA (CF(I,103),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,103),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,103),I= 37, 42) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,103),I= 43, 48) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,103),I= 49, 54) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,103),I= 55, 60) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,103),I= 61, 66) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,103),I= 67, 72) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,103),I= 73, 78) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,103),I= 79, 84) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,103),I= 85, 90) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,103),I= 91, 96) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,103),I= 97,102) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,103),I=103,108) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I,103),I=109,114) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I,103),I=115,120) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=7090,7107) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992/ C 1 T(7,2,1,5,6,3,4) - DATA (CF(I,104),I= 1, 6) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,104),I= 7, 12) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,104),I= 13, 18) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ 
+00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,104),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I,104),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,104),I= 31, 36) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,104),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,104),I= 43, 48) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,104),I= 49, 54) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,104),I= 55, 60) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,104),I= 61, 66) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,104),I= 67, 72) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,104),I= 73, 78) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,104),I= 79, 84) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,104),I= 85, 90) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,104),I= 91, 96) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,104),I= 97,102) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,104),I=103,108) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I,104),I=109,114) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,104),I=115,120) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ + DATA (CF(I),I=7108,7124) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136/ C 1 T(7,2,1,6,5,3,4) - DATA (CF(I,105),I= 1, 6) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,105),I= 7, 12) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ 
-4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,105),I= 13, 18) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,105),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,105),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,105),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,105),I= 37, 42) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,105),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,105),I= 49, 54) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,105),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,105),I= 61, 66) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,105),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,105),I= 73, 78) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,105),I= 79, 84) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,105),I= 85, 90) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,105),I= 91, 96) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,105),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I,105),I=103,108) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I,105),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,105),I=115,120) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ + DATA (CF(I),I=7125,7140) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160/ C 1 T(7,2,5,1,6,3,4) - DATA (CF(I,106),I= 1, 6) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ 
-1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,106),I= 7, 12) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,106),I= 13, 18) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,106),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I,106),I= 25, 30) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,106),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,106),I= 37, 42) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,106),I= 43, 48) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,106),I= 49, 54) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,106),I= 55, 60) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,106),I= 61, 66) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,106),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,106),I= 73, 78) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,106),I= 79, 84) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,106),I= 85, 90) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,106),I= 91, 96) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,106),I= 97,102) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I,106),I=103,108) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,106),I=109,114) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,106),I=115,120) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ + DATA (CF(I),I=7141,7155) 
/4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280/ C 1 T(7,2,5,6,1,3,4) - DATA (CF(I,107),I= 1, 6) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,107),I= 7, 12) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,107),I= 13, 18) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,107),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,107),I= 25, 30) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,107),I= 31, 36) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,107),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,107),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,107),I= 49, 54) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,107),I= 55, 60) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,107),I= 61, 66) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,107),I= 67, 72) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,107),I= 73, 78) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,107),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,107),I= 85, 90) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,107),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,107),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,107),I=103,108) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I,107),I=109,114) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I,107),I=115,120) 
/1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=7156,7169) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16/ C 1 T(7,2,6,1,5,3,4) - DATA (CF(I,108),I= 1, 6) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,108),I= 7, 12) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,108),I= 13, 18) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,108),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I,108),I= 25, 30) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,108),I= 31, 36) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,108),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,108),I= 43, 48) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,108),I= 49, 54) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,108),I= 55, 60) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,108),I= 61, 66) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,108),I= 67, 72) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,108),I= 73, 78) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,108),I= 79, 84) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,108),I= 85, 90) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,108),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,108),I= 97,102) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,108),I=103,108) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA 
(CF(I,108),I=109,114) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,108),I=115,120) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=7170,7182) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128/ C 1 T(7,2,6,5,1,3,4) - DATA (CF(I,109),I= 1, 6) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,109),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,109),I= 13, 18) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,109),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,109),I= 25, 30) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,109),I= 31, 36) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,109),I= 37, 42) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,109),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,109),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,109),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,109),I= 61, 66) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,109),I= 67, 72) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,109),I= 73, 78) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,109),I= 79, 84) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,109),I= 85, 90) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,109),I= 91, 96) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,109),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA 
(CF(I,109),I=103,108) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I,109),I=109,114) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I,109),I=115,120) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=7183,7194) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160/ C 1 T(7,5,1,2,6,3,4) - DATA (CF(I,110),I= 1, 6) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,110),I= 7, 12) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,110),I= 13, 18) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,110),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,110),I= 25, 30) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,110),I= 31, 36) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,110),I= 37, 42) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,110),I= 43, 48) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,110),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,110),I= 55, 60) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,110),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,110),I= 67, 72) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,110),I= 73, 78) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,110),I= 79, 84) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,110),I= 85, 90) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,110),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA 
(CF(I,110),I= 97,102) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,110),I=103,108) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,110),I=109,114) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I,110),I=115,120) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ + DATA (CF(I),I=7195,7205) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16/ C 1 T(7,5,1,6,2,3,4) - DATA (CF(I,111),I= 1, 6) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,111),I= 7, 12) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,111),I= 13, 18) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,111),I= 19, 24) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,111),I= 25, 30) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,111),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,111),I= 37, 42) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,111),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,111),I= 49, 54) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,111),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,111),I= 61, 66) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,111),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,111),I= 73, 78) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,111),I= 79, 84) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,111),I= 85, 90) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,111),I= 91, 96) 
/8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,111),I= 97,102) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I,111),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,111),I=109,114) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I,111),I=115,120) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=7206,7215) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16/ C 1 T(7,5,2,1,6,3,4) - DATA (CF(I,112),I= 1, 6) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,112),I= 7, 12) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,112),I= 13, 18) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,112),I= 19, 24) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,112),I= 25, 30) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,112),I= 31, 36) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,112),I= 37, 42) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,112),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,112),I= 49, 54) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,112),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,112),I= 61, 66) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,112),I= 67, 72) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,112),I= 73, 78) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,112),I= 79, 84) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,112),I= 85, 90) 
/-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,112),I= 91, 96) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,112),I= 97,102) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,112),I=103,108) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,112),I=109,114) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,112),I=115,120) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=7216,7224) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128/ C 1 T(7,5,2,6,1,3,4) - DATA (CF(I,113),I= 1, 6) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,113),I= 7, 12) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,113),I= 13, 18) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,113),I= 19, 24) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,113),I= 25, 30) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,113),I= 31, 36) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,113),I= 37, 42) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,113),I= 43, 48) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,113),I= 49, 54) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,113),I= 55, 60) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,113),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,113),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,113),I= 73, 78) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,113),I= 79, 84) /-1.358024691358025D-01, - $ 
-4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,113),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,113),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,113),I= 97,102) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,113),I=103,108) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I,113),I=109,114) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I,113),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=7225,7232) /4096,-1024,-16,128,-160,-16,-1024,128/ C 1 T(7,5,6,1,2,3,4) - DATA (CF(I,114),I= 1, 6) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,114),I= 7, 12) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,114),I= 13, 18) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,114),I= 19, 24) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,114),I= 25, 30) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,114),I= 31, 36) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,114),I= 37, 42) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,114),I= 43, 48) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,114),I= 49, 54) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,114),I= 55, 60) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,114),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,114),I= 67, 72) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,114),I= 73, 78) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, 
- $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,114),I= 79, 84) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,114),I= 85, 90) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,114),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,114),I= 97,102) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I,114),I=103,108) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,114),I=109,114) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I,114),I=115,120) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ + DATA (CF(I),I=7233,7239) /4096,-160,-16,-16,128,128,-1024/ C 1 T(7,5,6,2,1,3,4) - DATA (CF(I,115),I= 1, 6) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,115),I= 7, 12) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,115),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,115),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,115),I= 25, 30) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,115),I= 31, 36) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,115),I= 37, 42) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,115),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,115),I= 49, 54) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,115),I= 55, 60) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,115),I= 61, 66) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,115),I= 67, 72) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ 
-1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,115),I= 73, 78) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,115),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,115),I= 85, 90) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,115),I= 91, 96) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,115),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,115),I=103,108) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,115),I=109,114) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,115),I=115,120) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ + DATA (CF(I),I=7240,7245) /4096,-1024,-1024,128,128,1280/ C 1 T(7,6,1,2,5,3,4) - DATA (CF(I,116),I= 1, 6) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,116),I= 7, 12) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,116),I= 13, 18) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,116),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,116),I= 25, 30) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,116),I= 31, 36) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,116),I= 37, 42) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,116),I= 43, 48) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,116),I= 49, 54) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,116),I= 55, 60) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,116),I= 61, 66) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - 
$ ,3.086419753086420D-02/ - DATA (CF(I,116),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,116),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,116),I= 79, 84) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,116),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,116),I= 91, 96) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,116),I= 97,102) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,116),I=103,108) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,116),I=109,114) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,116),I=115,120) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ + DATA (CF(I),I=7246,7250) /4096,128,1280,-1024,128/ C 1 T(7,6,1,5,2,3,4) - DATA (CF(I,117),I= 1, 6) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,117),I= 7, 12) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,117),I= 13, 18) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,117),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,117),I= 25, 30) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,117),I= 31, 36) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,117),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,117),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,117),I= 49, 54) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,117),I= 55, 60) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,117),I= 61, 66) 
/-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,117),I= 67, 72) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,117),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,117),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,117),I= 85, 90) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,117),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,117),I= 97,102) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,117),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,117),I=109,114) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,117),I=115,120) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ + DATA (CF(I),I=7251,7254) /4096,-1024,1280,128/ C 1 T(7,6,2,1,5,3,4) - DATA (CF(I,118),I= 1, 6) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,118),I= 7, 12) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,118),I= 13, 18) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,118),I= 19, 24) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,118),I= 25, 30) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,118),I= 31, 36) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,118),I= 37, 42) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,118),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I= 49, 54) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,118),I= 55, 60) /-1.358024691358025D-01, - $ 
-4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,118),I= 61, 66) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,118),I= 67, 72) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,118),I= 73, 78) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,118),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I= 85, 90) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,118),I= 91, 96) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I= 97,102) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,118),I=103,108) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,118),I=109,114) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I=115,120) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ + DATA (CF(I),I=7255,7257) /4096,128,-1024/ C 1 T(7,6,2,5,1,3,4) - DATA (CF(I,119),I= 1, 6) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,119),I= 7, 12) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,119),I= 13, 18) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,119),I= 19, 24) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,119),I= 25, 30) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,119),I= 31, 36) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,119),I= 37, 42) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,119),I= 43, 48) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,119),I= 49, 54) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ 
,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,119),I= 55, 60) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,119),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,119),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,119),I= 73, 78) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,119),I= 79, 84) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,119),I= 85, 90) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,119),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,119),I= 97,102) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,119),I=103,108) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,119),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,119),I=115,120) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ + DATA (CF(I),I=7258,7259) /4096,-1024/ C 1 T(7,6,5,1,2,3,4) - DATA (CF(I,120),I= 1, 6) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,120),I= 7, 12) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,120),I= 13, 18) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,120),I= 19, 24) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,120),I= 25, 30) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,120),I= 31, 36) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,120),I= 37, 42) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,120),I= 43, 48) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ 
- DATA (CF(I,120),I= 49, 54) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,120),I= 55, 60) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,120),I= 61, 66) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,120),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,120),I= 73, 78) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,120),I= 79, 84) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,120),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,120),I= 91, 96) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,120),I= 97,102) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,120),I=103,108) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,120),I=109,114) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,120),I=115,120) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ + DATA (CF(I),I=7260,7260) /4096/ C 1 T(7,6,5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
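The hunks above replace the dense REAL*8 colour matrix CF(J,I) of the generated matrix1.f with a single packed integer array CF(I): for each colour row I only the entries J >= I are kept, the off-diagonal coefficients are doubled so that the symmetric lower triangle never needs to be visited, and everything is rescaled to integers over one common denominator (the tabulated values are consistent with a denominator of 324, e.g. the diagonal 1.264197530864197D+01 becomes 4096 and the off-diagonal -1.580246913580247D+00 becomes -1024 after doubling). The MATRIX1 hunk that follows then runs the inner colour loop only over J = I, NCOLOR with a running CF_INDEX and performs a single division by DENOM at the end. A minimal standalone C++ sketch of that triangular colour sum, assuming this packing convention (the names colorSum, cf, jamp and denom are illustrative, not part of the generated code):

#include <complex>
#include <vector>

// cf holds, row by row, the packed entries for J >= I: the diagonal as-is and the
// off-diagonal already doubled, all scaled by the common integer denominator 'denom'.
double colorSum( const std::vector<long>& cf,
                 const std::vector<std::complex<double>>& jamp,
                 const double denom )
{
  const int ncolor = static_cast<int>( jamp.size() );
  double me2 = 0;
  int cfIndex = 0; // plays the role of CF_INDEX in the Fortran loop
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) // only J >= I, as in "DO J = I, NCOLOR"
      ztemp += static_cast<double>( cf[cfIndex++] ) * jamp[j];
    me2 += ( ztemp * std::conj( jamp[i] ) ).real();
  }
  return me2 / denom; // single "MATRIX1 = MATRIX1/DENOM" at the end
}

int main()
{
  // toy 2x2 symmetric colour matrix {{3,1},{1,3}}: packed upper triangle with doubled off-diagonal is {3,2,3}
  const std::vector<long> cf = { 3, 2, 3 };
  const std::vector<std::complex<double>> jamp = { { 1., 0. }, { 0., 1. } };
  return ( colorSum( cf, jamp, 1. ) == 6. ) ? 0 : 1; // conj(J)^T CF J = 6 for J = (1, i)
}

Compared with the old dense double-precision table this roughly halves the number of stored coefficients and keeps them as exact integers, at the cost of one extra division per colour sum.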
@@ -18811,10 +10161,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
      MATRIX1 = 0.D0
      DO M = 1, NAMPSO
+       CF_INDEX = 0
        DO I = 1, NCOLOR
          ZTEMP = (0.D0,0.D0)
-         DO J = 1, NCOLOR
-           ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M)
+         DO J = I, NCOLOR
+           CF_INDEX = CF_INDEX + 1
+           ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M)
          ENDDO
          DO N = 1, NAMPSO
@@ -18823,6 +10175,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
          ENDDO
        ENDDO
      ENDDO
+     MATRIX1 = MATRIX1/DENOM
      IF(SDE_STRAT.EQ.1)THEN
        AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1))
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f
index 9a31ed201d..d6cded9a2d 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f
@@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff,
      integer icol ! color selected
      integer isym(nexternal,99), jsym
-      integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic
+      integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic
      integer mo_color,da_color(2),itmp
      integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg
      integer icolalt(2,-nexternal+2:2*nexternal-3)
@@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff,
        endif
        lconfig = vec_igraph1(ivec)
      endif
-
+      is_LC=.true.
+      maxcolor=0
c
c     Choose a color flow which is certain to work with the propagator
c     structure of the chosen diagram and use that as an alternative
c
      if (icol.eq.0) then
        do i=1,nexternal
-          icolalt(1,i)=0
+          icolalt(1,i)=0
          icolalt(2,i)=0
        enddo
      else
@@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff,
          ncolmp=0
        endif
        if(mo_color.gt.1.and.
-     $     mo_color.ne.3.and.mo_color.ne.8)then
+     $     mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then
          da_color(1)=get_color(jpart(1,ida(1)))
          da_color(2)=get_color(jpart(1,ida(2)))
          call write_error(da_color(1), da_color(2), mo_color)
@@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff,
          endif
        endif !end of check on LC
-c       Just zero helicity info for intermediate states
-        jpart(7,i) = 0
+c       Just No helicity info for intermediate states
+        jpart(7,i) = 9
      enddo !
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
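The fix_tchannel_color and elim_indices hunks above make two recurring changes: a mother's colour and anticolour slots (icol(1,ires), icol(2,ires)) are now filled only while still unset, so a later daughter index can no longer overwrite an existing label, and the epsilon/epsilon-bar vertex tests are relaxed from requiring i3 (or i3bar) to be exactly zero to only requiring that the index count closes modulo 3. A small C++ transliteration of those two tests, assuming (as the routine arguments suggest) that i3/i3bar count the triplet/antitriplet indices found on the daughter legs and n3/n3bar those requested for the mother; assignIfUnset, isEpsilonVertex and isEpsilonBarVertex are illustrative names, not routines in the Fortran sources:

#include <cassert>

// Fill a colour slot only if it is still empty (0), mirroring the new
// "...and.icol(..,ires).eq.0" guards added above.
inline void assignIfUnset( int& slot, int label )
{
  if( slot == 0 ) slot = label;
}

// Epsilon / epsilon-bar vertex tests sketched from the new conditions
// "mod(i3bar+n3,3).eq.i3" and "mod(i3+n3bar,3).eq.i3bar".
inline bool isEpsilonVertex( int i3, int i3bar, int n3, int n3bar )
{
  return n3 > 0 && n3bar == 0 && ( i3bar + n3 ) % 3 == i3;
}

inline bool isEpsilonBarVertex( int i3, int i3bar, int n3, int n3bar )
{
  return n3bar > 0 && n3 == 0 && ( i3 + n3bar ) % 3 == i3bar;
}

int main()
{
  int slot = 501;                    // a colour label already assigned
  assignIfUnset( slot, -502 );
  assert( slot == 501 );             // the existing label is preserved
  int empty = 0;
  assignIfUnset( empty, -502 );
  assert( empty == -502 );           // an empty slot is filled as before
  assert( isEpsilonVertex( 0, 1, 2, 0 ) );     // one antitriplet daughter, two mother triplet indices
  assert( !isEpsilonBarVertex( 0, 1, 2, 0 ) ); // not an epsilon-bar configuration (n3 > 0)
  return 0;
}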
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x 
* blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk new 
file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
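# [Editor's sketch, not part of the generated patch] Example expansion of the
# pattern rule below for one hypothetical matrix-element source file, assuming
# FC=gfortran and only the '-w' flag added to FFLAGS above:
#   gfortran -w -c matrix1_optim.f -I../../Source/ -I../../Source/PDF/gammaUPC
# (plus whatever $(MATRIX_FLAG) is set to, when the file matches $(MATRIX)).
# The -I../../Source/ include path is what lets the compiler find the
# DiscreteSampler module mentioned in the comment above.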
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - 
logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in 
opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
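As a reading aid for the gen_ximprove_gridpack.get_job_for_event changes above: the sketch below is a standalone, simplified Python version of the splitting arithmetic (class attributes turned into plain arguments; the rescaling of nevents by the previous iteration's nevents/nunwgt ratio is omitted). It is an illustration of the bookkeeping, not the code that actually runs.

def split_channel(needed_event, max_request_event, max_splitting,
                  min_event_in_iter, max_event_in_iter, split_channels=True):
    """Simplified sketch of the sub-job splitting used for gridpack refinement."""
    # one sub-job per block of max_request_event events, at least one job
    nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
    if not split_channels:
        nb_split = 1
    nb_split = max(1, min(nb_split, max_splitting))
    # crude estimate of phase-space points per iteration for each sub-job
    nevents = needed_event / nb_split
    if nevents < min_event_in_iter:
        # too few points per job: merge some sub-jobs back together
        nb_split = int(nb_split * nevents / min_event_in_iter) + 1
        nevents = min_event_in_iter
    nevents = max(min_event_in_iter, min(max_event_in_iter, nevents))
    return nb_split, int(nevents)

# e.g. 100000 requested events with at most 2500 per sub-job -> 40 jobs of 2500 points
assert split_channel(100000, 2500, 1000, 500, 4000) == (40, 2500)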
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
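A note on a pattern that recurs in this patch (extended_cmd.py and file_writers.py above, write_param_card.py and check_survey further down): bare exec()/eval() calls are replaced by calls that pass an explicit globals/locals pair, so the names created by the executed string land in a dictionary the caller controls. In Python 3, exec inside a function cannot reliably create new local variables, which is presumably the motivation. A minimal standalone illustration (the parameter name below is only an example):

import cmath

# names assigned by the executed string land in `context`, not in our local scope
context = {'cmath': cmath}
exec("mdl_sqrt2 = cmath.sqrt(2).real", globals(), context)

# later evaluations can resolve those names by passing the same dictionary
value = eval("2 * mdl_sqrt2", globals(), context)
assert abs(value - 2 ** 1.5) < 1e-12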
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
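For the lhe_parser.py pseudorapidity fix a little further up: with norm = sqrt(px**2 + py**2 + pz**2), the definition is eta = 0.5*log((norm + pz)/(norm - pz)) = artanh(pz/norm); the pre-fix code had numerator and denominator swapped, which only flips the sign of eta. A quick standalone check of the corrected formula:

import math

def pseudorapidity(px, py, pz):
    # eta = 0.5 * ln((|p| + pz) / (|p| - pz)) = artanh(pz / |p|)
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

px, py, pz = 1.0, 2.0, 3.0
norm = math.sqrt(px**2 + py**2 + pz**2)
assert abs(pseudorapidity(px, py, pz) - math.atanh(pz / norm)) < 1e-12
assert pseudorapidity(px, py, pz) > 0  # forward-going (pz > 0) momenta get positive eta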
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
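The do_pythia8 changes above make Pythia8's bundled main164 example the default driver and fall back to the MG5aMC_PY8_interface route ('--old_interface') when it cannot be found. The lookup is an ordered path probe; a hedged sketch of just that step, with the two candidate locations taken from the hunk above (find_main164 is a hypothetical helper, not part of the patch):

import os

def find_main164(pythia8_path):
    """Return the first existing main164 executable, or None to trigger the fallback."""
    candidates = [
        os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
        os.path.join(pythia8_path, 'examples', 'main164'),
    ]
    for path in candidates:
        if os.path.exists(path):
            return path
    return None  # caller then retries with ' --old_interface'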
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
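Each split above receives its own Main:numberOfEvents and has HEPMCoutput:scaling multiplied by its number of events; partition_for_PY8 itself is computed elsewhere and is not shown in this hunk. Purely as an illustration of the bookkeeping, a hypothetical even partition (chunk sizes differing by at most one) could look like:

def partition_events(total, nb_splits):
    # hypothetical helper, not the one MadEvent uses: spread `total` events
    # over nb_splits chunks whose sizes differ by at most one
    base, rest = divmod(total, nb_splits)
    return [base + (1 if i < rest else 0) for i in range(nb_splits)]

parts = partition_events(10007, 8)
assert sum(parts) == 10007 and max(parts) - min(parts) <= 1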
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
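The remove_empty_events helper added above treats an events.lhe below 10 bytes as empty and scans the channel log backwards for known markers to record why. The sketch below mirrors only that classification step (marker strings copied from the hunk; misc.BackRead, the 'Deleting file events.lhe' anchor and the not-found-after-N-lines heuristic are left out), so it is an approximation rather than the shipped logic:

import os
from collections import defaultdict

MARKERS = {  # marker substring -> reason label, as in remove_empty_events above
    'Impossible BW configuration': 'bwconfig',
    'Loosen cuts or increase max_events': 'cuts',
    'all returned zero': 'zero',
}

def classify_empty_channels(gdirs, logname='log.txt'):
    """Return {reason: [Gdir, ...]} for channels whose events.lhe is (nearly) empty."""
    reasons = defaultdict(list)
    for gdir in gdirs:
        try:
            size = os.path.getsize(os.path.join(gdir, 'events.lhe'))
        except OSError:
            size = 0
        if size >= 10:
            continue  # channel produced events, keep it
        label = 'unknown'
        try:
            with open(os.path.join(gdir, logname)) as log:
                tail = log.readlines()[-200:]  # cheap stand-in for reading backwards
        except OSError:
            tail = []
        for line in reversed(tail):
            hits = [reason for marker, reason in MARKERS.items() if marker in line]
            if hits:
                label = hits[0]
                break
        reasons[label].append(gdir)
    return reasons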
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data b/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/madevent b/epochX/cudacpp/gg_ttggg.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/madevent +++ b/epochX/cudacpp/gg_ttggg.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index 53dd560ed6..da11e740d9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 37d3314a5d..4feba239bd 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005965471267700195  +DEBUG: model prefixing takes 0.004603147506713867  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,33 +150,33 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.863 s +1 processes with 1240 diagrams generated in 1.800 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.535 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. +Generated helas calls for 1 subprocesses (1240 diagrams) in 5.597 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.348 s +ALOHA: aloha creates 5 routines in 0.328 s VVV1 VVV1 FFV1 @@ -190,17 +189,17 @@ ALOHA: aloha creates 5 routines in 0.348 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m12.948s -user 0m12.781s -sys 0m0.107s -Code generation completed in 13 seconds +real 0m11.011s +user 0m10.838s +sys 0m0.129s +Code generation completed in 11 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
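A minimal usage sketch, not part of the patch, for the checkGpuBlas/assertGpuBlas helper added to GpuRuntime.h just above, combined with the gpuBlas* aliases from the GpuAbstraction.h hunks: it wraps cuBLAS/hipBLAS calls and asserts if a call does not return GPUBLAS_STATUS_SUCCESS. It assumes a GPU build with BLAS enabled; the function name is illustrative.

// Illustrative only: create and destroy a BLAS handle through the gpuBlas* aliases.
#include "GpuAbstraction.h"
#include "GpuRuntime.h"

#if defined( MGONGPUCPP_GPUIMPL ) && !defined( MGONGPU_HAS_NO_BLAS )
inline void exampleBlasHandleLifetime()
{
  gpuBlasHandle_t handle;                   // cublasHandle_t (CUDA) or hipblasHandle_t (HIP)
  checkGpuBlas( gpuBlasCreate( &handle ) ); // assert on any non-SUCCESS status
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif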
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
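The constructor and computeGoodHelicities changes above introduce a runtime switch for the BLAS color sum (via the CUDACPP_RUNTIME_BLASCOLORSUM and CUDACPP_RUNTIME_CUBLASTF32TENSOR environment variables), one GPU stream per good helicity, and a single cuBLAS handle shared by all helicities. The following standalone sketch illustrates that setup pattern only; it uses raw CUDA/cuBLAS calls instead of the plugin's gpuXxx/checkGpuBlas wrappers, and the struct and function names are hypothetical.

// Sketch only (not the plugin code): one-time env-var switches plus
// per-helicity stream and cuBLAS handle creation, using raw CUDA/cuBLAS calls.
#include "cublas_v2.h"
#include <cuda_runtime.h>
#include <cstdlib>
#include <vector>

struct BlasColorSumSetup // hypothetical helper, for illustration only
{
  bool useBlas = false;              // from CUDACPP_RUNTIME_BLASCOLORSUM (set and non-empty)
  bool useTf32 = false;              // from CUDACPP_RUNTIME_CUBLASTF32TENSOR (set and non-empty)
  cublasHandle_t handle = nullptr;   // one handle for all good helicities
  std::vector<cudaStream_t> streams; // one stream per good helicity
};

inline BlasColorSumSetup makeSetup( int nGoodHel )
{
  BlasColorSumSetup s;
  const char* e1 = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  s.useBlas = ( e1 && *e1 != '\0' );
  const char* e2 = std::getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" );
  s.useTf32 = s.useBlas && ( e2 && *e2 != '\0' );
  s.streams.resize( nGoodHel );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    cudaStreamCreate( &s.streams[ighel] ); // error checking omitted for brevity
  if( s.useBlas )
  {
    cublasCreate( &s.handle );
    if( s.useTf32 ) cublasSetMathMode( s.handle, CUBLAS_TF32_TENSOR_OP_MATH );
  }
  return s;
}

Teardown would mirror the destructor shown above: cublasDestroy on the handle (if created) and cudaStreamDestroy on each non-null stream.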
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index 07099839d3..763cfce31f 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 120; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
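The DeviceAccessJamp2 helper introduced above encodes the SoA layout used for the jamp2 buffer: the color index is the slow dimension and the event index (one GPU thread per event) is the fast one, i.e. buffer[icol * nevt + ievt]. A minimal sketch of this access pattern follows; the kernel name and the assumption of separate real/imaginary input arrays are hypothetical and only meant to illustrate the indexing, not the plugin's calculate_jamps.

// Sketch only: SoA indexing buffer[icol * nevt + ievt], one thread per event.
// "accumulateJamp2" is a hypothetical kernel, not part of the plugin.
__global__ void accumulateJamp2( const double* jampRe, // [ncolor * nevt] real parts
                                 const double* jampIm, // [ncolor * nevt] imaginary parts
                                 double* jamp2,        // [ncolor * nevt] running sum over helicities
                                 int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;             // assumes the launch grid covers exactly nevt events
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double re = jampRe[icol * nevt + ievt];
    const double im = jampIm[icol * nevt + ievt];
    jamp2[icol * nevt + ievt] += re * re + im * im;    // |jamp|^2, as in cxabs2
  }
}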
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -31856,272 +31912,43 @@ namespace mg5amcCpu jamp_sv[116] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxggg()?) 
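The hunk below removes the hardcoded color matrix cf[ncolor][ncolor] and the denominators denom[ncolor] from calculate_wavefunctions, since the color sum is now performed in a separate function/kernel (see the color_sum.h include added earlier). For reference, the quadratic form those arrays implement is |M|^2 += Re( sum_i conj(jamp_i) * sum_j cf_ij * jamp_j ) / denom_i. The host-side sketch below spells out that computation for one event and one helicity; it assumes a row-major cf array and std::complex amplitudes, and it is not the plugin's color_sum.h, which can reformulate the same sum as cuBLAS/hipBLAS matrix products over all events of a helicity stream.

// Reference sketch of the color sum quadratic form (one event, one helicity).
#include <complex>
double colorSum( const std::complex<double>* jamp, // [ncolor] QCD partial amplitudes
                 const double* cf,                 // [ncolor*ncolor] color matrix, row-major
                 const double* denom,              // [ncolor] color denominators
                 int ncolor )
{
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += cf[icol * ncolor + jcol] * jamp[jcol];
    me2 += ( ztemp * std::conj( jamp[icol] ) ).real() / denom[icol];
  }
  return me2; // contribution of this helicity to |M|^2
}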
- - // The color denominators (initialize all array elements, with ncolor=120) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120] - - // The color matrix (initialize all array elements, with ncolor=120) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 }, - { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, - { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, - { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, - { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, 
-134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, - { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, - { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, - { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, - { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, - { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, - { -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 
442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, - { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, - { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, - { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 }, - { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, - { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, - { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, - { 496, -224, -80, -8, 
496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, - { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, - { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, - { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, - { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, - { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, - { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, 
-62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, - { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, - { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, - { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, - { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, - { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, - { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 
-80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, - { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, - { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, - { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, - { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, - { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, - { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 
1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, - { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, - { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, - { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, - { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, - { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, - { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, - { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, 
-80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, - { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, - { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, - { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, - { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, - { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, - { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, 
-512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, - { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, - { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, - { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, - { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, - { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, - { 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 
1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, - { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, - { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, - { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, - { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, - { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, - { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 
100 }, - { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, - { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, - { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, - { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, - { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, - { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, - { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 
10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, - { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, - { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, - { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, - { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, - { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, - { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, 
-44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, - { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, - { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, - { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, - { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, - { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, - { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, 
-134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, - { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, - { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, - { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, - { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, - { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, - { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, - { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, 
-53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, - { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, - { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, - { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, - { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, - { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, - { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 
100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, - { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, - { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, - { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, - { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, - { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496 }, - { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 
136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, - { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, - { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, - { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, - { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, - { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, - { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, 
-8, 496, -224, -80, -8, 496, -80 }, - { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, - { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, - { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, - { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, - { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, - { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, - { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 
-53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, - { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, - { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, - { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, - { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, - { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, - { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, 
-224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, - { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, - { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! 
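  // A minimal standalone sketch of the accumulation pattern used just below (NCOLSKETCH and addJamp2Sketch
  // are illustrative names only, not the plugin's DeviceAccessJamp2 API): when each good helicity runs in its
  // own CUDA stream, two kernels may update the same per-event, per-color |jamp|^2 slot at the same time, so
  // the read-modify-write must be atomic (note: atomicAdd on double requires compute capability 6.0 or later).
  //
  //   #include <cuda_runtime.h>
  //   #include <cstdio>
  //   #define NCOLSKETCH 2 // hypothetical number of color flows
  //
  //   __global__ void addJamp2Sketch( double* jamp2, const double* jampRe, const double* jampIm, int nevt )
  //   {
  //     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
  //     if( ievt >= nevt ) return;
  //     for( int icol = 0; icol < NCOLSKETCH; icol++ )
  //     {
  //       const double re = jampRe[icol * nevt + ievt];
  //       const double im = jampIm[icol * nevt + ievt];
  //       atomicAdd( &jamp2[icol * nevt + ievt], re * re + im * im ); // safe under concurrent helicity streams
  //     }
  //   }
  //
  //   int main()
  //   {
  //     const int nevt = 32;
  //     double *jamp2, *re, *im;
  //     cudaMallocManaged( &jamp2, NCOLSKETCH * nevt * sizeof( double ) );
  //     cudaMallocManaged( &re, NCOLSKETCH * nevt * sizeof( double ) );
  //     cudaMallocManaged( &im, NCOLSKETCH * nevt * sizeof( double ) );
  //     for( int i = 0; i < NCOLSKETCH * nevt; i++ ) { jamp2[i] = 0.; re[i] = 1.; im[i] = 2.; }
  //     cudaStream_t s0, s1;
  //     cudaStreamCreate( &s0 );
  //     cudaStreamCreate( &s1 );
  //     addJamp2Sketch<<<1, nevt, 0, s0>>>( jamp2, re, im, nevt ); // "helicity 0"
  //     addJamp2Sketch<<<1, nevt, 0, s1>>>( jamp2, re, im, nevt ); // "helicity 1"
  //     cudaDeviceSynchronize();
  //     std::printf( "jamp2[0] = %f (expect 10 = 2 helicities x (1*1 + 2*2))\n", jamp2[0] );
  //     cudaStreamDestroy( s0 );
  //     cudaStreamDestroy( s1 );
  //     cudaFree( jamp2 );
  //     cudaFree( re );
  //     cudaFree( im );
  //     return 0;
  //   }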
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -32273,7 +32100,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } 
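  // As a minimal, hedged illustration of the change above (fpeEnable() is now only called in MGONGPUCPP_DEBUG
  // builds): on Linux/glibc, where the feenableexcept extension is available, enabling SIGFPE traps typically
  // looks like the sketch below; the plugin's actual fpeEnable implementation may differ, and fpeEnableSketch
  // is an illustrative name only.
  //
  //   #include <fenv.h>  // feenableexcept (glibc extension; g++ defines _GNU_SOURCE by default)
  //   #include <cstdio>
  //
  //   void fpeEnableSketch()
  //   {
  //     // Convert invalid operations, divisions by zero and overflows into SIGFPE signals,
  //     // so that NaNs and infinities abort a debug run early instead of silently propagating
  //     feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW );
  //   }
  //
  //   int main()
  //   {
  //   #ifdef MGONGPUCPP_DEBUG // same guard as in the constructor above
  //     fpeEnableSketch();
  //     std::printf( "SIGFPE traps enabled (debug build)\n" );
  //   #endif
  //     return 0;
  //   }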
//-------------------------------------------------------------------------- @@ -32309,6 +32140,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -32352,6 +32187,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -32472,8 +32311,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -32481,25 +32320,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); 
nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -32644,13 +32661,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -32662,18 +32673,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -32698,93 +32714,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -32826,7 +32779,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -32849,7 +32802,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -32858,21 +32811,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -32886,8 +32841,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -32903,11 +32860,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -33009,14 +32967,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index 2eb1e066ff..f20243637a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 128; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 1240; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 120; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc new file mode 100644 index 0000000000..dea7f9fdb2 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc @@ -0,0 +1,545 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
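// ---------------------------------------------------------------------------
// [Illustration only - not part of the generated code] The new GPU sigmaKin in
// the diff above launches one calculate_jamps kernel per good helicity, each on
// its own CUDA/HIP stream (gpuLaunchKernelStream), and synchronizes once
// (gpuDeviceSynchronize) before the helicity/colour selection kernels. A minimal
// standalone CUDA sketch of that "one good helicity per stream" pattern follows;
// the kernel name, function name and buffer layout are hypothetical stand-ins,
// assuming one event per GPU thread as in the plugin.

#include <cuda_runtime.h>
#include <vector>

__global__ void computeOneHelicity( double* out, int ihel, int nevt ) // stand-in for calculate_jamps
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
  if( ievt < nevt ) out[ievt] = static_cast<double>( ihel ); // placeholder for the real per-helicity work
}

void runGoodHelicities( double* devSuperBuffer, const std::vector<int>& goodHel, int gpublocks, int gputhreads )
{
  const int nevt = gpublocks * gputhreads;
  std::vector<cudaStream_t> streams( goodHel.size() );
  for( int ighel = 0; ighel < (int)goodHel.size(); ighel++ )
  {
    cudaStreamCreate( &streams[ighel] );
    double* slice = devSuperBuffer + ighel * nevt; // each helicity writes its own slice of the super-buffer
    computeOneHelicity<<<gpublocks, gputhreads, 0, streams[ighel]>>>( slice, goodHel[ighel], nevt );
  }
  cudaDeviceSynchronize(); // wait for all helicity streams before selecting one helicity/colour per event
  for( auto& s : streams ) cudaStreamDestroy( s );
}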
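// ---------------------------------------------------------------------------
// [Illustration only - not part of the generated code] The two constant arrays
// defined below (colorDenom and colorMatrix, with ncolor=120 for this process)
// encode the colour factor matrix CF of the process, stored as integer
// numerators over a per-row denominator: CF[i][j] = colorMatrix[i][j] / colorDenom[i].
// The colour-summed contribution to |M|^2 for one helicity (before the final
// spin/colour averaging by helcolDenominators in sigmaKin) is
//   Re( sum_i ( sum_j colorMatrix[i][j] * jamp[j] ) * conj( jamp[i] ) / colorDenom[i] ).
// A minimal standalone sketch of this reduction follows, using plain
// std::complex<double> instead of the plugin's vectorized types; the function
// name is hypothetical.

#include <complex>

template<int N>
double colorSumExample( const std::complex<double> jamp[N], // QCD partial amplitudes for one event and one helicity
                        const double cfNum[N][N],           // colour matrix numerators (colorMatrix)
                        const double cfDen[N] )             // colour matrix denominators (colorDenom)
{
  double me2 = 0;
  for( int icol = 0; icol < N; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < N; jcol++ )
      ztemp += cfNum[icol][jcol] * jamp[jcol];                          // row icol of CF times the jamp vector
    me2 += ( ztemp * std::conj( jamp[icol] ) ).real() / cfDen[icol];    // add Re( (CF.jamp)_i * jamp_i^* ) / den_i
  }
  return me2; // colour-summed |M|^2 for this helicity
}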
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120] + + // The color matrix (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 }, + { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, + { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, + { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, 
-71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, + { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, + { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, + { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, + { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, + { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, + { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, + { -8, 64, 568, 496, 640, -80, 64, -512, 
640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, + { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, + { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, + { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 }, + { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, + { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, + { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 
496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, + { 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, + { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, + { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, + { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, + { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, + { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 
1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, + { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, + { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, + { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, + { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, + { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, + { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, 
-62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, + { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, + { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, + { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, + { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, + { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, + { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, + { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 
28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, + { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, + { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, + { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, + { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, + { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, + { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 
442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, + { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, + { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, + { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, + { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, + { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, + { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, 
-71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, + { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, + { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, + { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, + { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, + { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, + { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, + { 640, 
-80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, + { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, + { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, + { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, + { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, + { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, + { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, 
-71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100 }, + { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, + { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, + { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, + { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, + { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, + { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, 
-512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, + { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, + { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, + { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, + { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, + { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, + { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 
28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, + { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, + { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, + { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, + { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, + { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, + { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, + { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 
10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, + { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, + { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, + { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, + { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, + { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, + { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 
1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, + { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, + { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, + { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, + { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, + { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, + { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, 
-224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, + { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, + { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, + { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, + { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, + { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, + { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 
640, -80, 568, 496 }, + { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, + { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, + { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, + { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, + { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, + { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, + { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, 
-71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80 }, + { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, + { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, + { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, + { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, + { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, + { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 
10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, + { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, + { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, + { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, + { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, + { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, + { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 
640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, + { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, + { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, + { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( 
int icol = 0; icol < ncolor; icol++ )
+ {
+ // Diagonal terms
+ value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+ // Off-diagonal terms
+ for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+ value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+ }
+ }
+ fptype2 value[ncolor][ncolor];
+ };
+ static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+ // Use the property that M is a real matrix (see #475):
+ // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB
+ // In addition, on C++ use the property that M is symmetric (see #475),
+ // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+ // we gain (though not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix.
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+ fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype_sv deltaMEs_next = { 0 };
+ // Mixed mode: merge two neppV vectors into one neppV2 vector
+ fptype2_sv jampR_sv[ncolor];
+ fptype2_sv jampI_sv[ncolor];
+ for( int icol = 0; icol < ncolor; icol++ )
+ {
+ jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+ jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+ }
+#else
+ const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+ // Loop over icol
+ for( int icol = 0; icol < ncolor; icol++ )
+ {
+ // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype2_sv& jampRi_sv = jampR_sv[icol];
+ fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+ fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+ fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+ fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+ fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+ // Loop over jcol
+ for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+ {
+ // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype2_sv& jampRj_sv = jampR_sv[jcol];
+ fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+ fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+ fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+ ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+ ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+ }
+ fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ deltaMEs += fpvsplit0( deltaMEs2 );
+ deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+ deltaMEs += deltaMEs2;
+#endif
+ }
+ // *** STORE THE RESULTS ***
+ using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+ fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+ // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+ MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+ fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+ MEs_sv_next += deltaMEs_next;
+#endif
+ }
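As an illustration of the optimization in color_sum_cpu above (not part of the patch): because the normalized color matrix M is real and symmetric, the quadratic form (A-iB)(M)(A+iB) = AMA + iAMB - iBMA + BMB collapses to AMA + BMB, and the double loop can be folded into a diagonal term plus doubled upper-triangular terms. The following minimal, self-contained C++ sketch checks that equivalence; the 2x2 matrix, denominators and amplitudes are made-up numbers standing in for colorMatrix/colorDenom (the generated process uses the 120x120 arrays above), so this is only a hedged sketch, not plugin code.

```cpp
// Standalone check (illustration only): full quadratic form vs triangular color sum
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;
  // Hypothetical stand-ins for colorDenom and colorMatrix (real, symmetric)
  constexpr double denom[ncolor] = { 3., 3. };
  constexpr double cf[ncolor][ncolor] = { { 16., -2. }, { -2., 16. } };
  // Hypothetical color amplitudes (the "jamps") for one event and one helicity
  const std::complex<double> jamp[ncolor] = { { 1.5, -0.5 }, { 0.25, 2.0 } };
  // Full quadratic form conj(J)_i * ( cf[i][j] / denom[i] ) * J_j (imaginary part cancels)
  double me2full = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2full += ( std::conj( jamp[i] ) * jamp[j] ).real() * cf[i][j] / denom[i];
  // Triangular form: diagonal term once, off-diagonal terms doubled (uses cf[i][j] == cf[j][i])
  double me2tri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztR = cf[i][i] / denom[i] * jamp[i].real();
    double ztI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2tri += ztR * jamp[i].real() + ztI * jamp[i].imag();
  }
  std::printf( "full=%.12f triangular=%.12f\n", me2full, me2tri ); // the two values agree
  return 0;
}
```

Both loops print the same value, which is the |M|^2 contribution that the triangular loop in color_sum_cpu accumulates per event and per helicity.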
+#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fbridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/makefile_original.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], 
buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LINKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below.
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index 53dd560ed6..da11e740d9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported (e.g. for the BLAS-based colour sum) +// For both CUDA and HIP, by default, assume BLAS is available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
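As an illustration of how client code can honour the MGONGPU_HAS_NO_BLAS switch introduced in mgOnGpuConfig.h above, here is a minimal C++ sketch (not part of this patch): it guards a standard cuBLAS v2 call and falls back to a plain host loop in noBLAS builds. The function name exampleBlasGemv and the host-side fallback are illustrative assumptions, not identifiers taken from this codebase.

#include "mgOnGpuConfig.h"
#ifndef MGONGPU_HAS_NO_BLAS
#include "cublas_v2.h"
#endif

// Compute y = A * x for an n x n column-major matrix in double precision
inline void exampleBlasGemv( const double* A, const double* x, double* y, int n )
{
#ifndef MGONGPU_HAS_NO_BLAS
  // BLAS build (CUDA with cuBLAS headers available): A, x, y are device pointers
  cublasHandle_t handle;
  cublasCreate( &handle );
  const double alpha = 1., beta = 0.;
  cublasDgemv( handle, CUBLAS_OP_N, n, n, &alpha, A, n, x, 1, &beta, y, 1 );
  cublasDestroy( handle );
#else
  // noBLAS build (e.g. a C++-only build, or -DMGONGPU_HAS_NO_BLAS set by hand): A, x, y are host pointers
  for( int i = 0; i < n; ++i )
  {
    y[i] = 0.;
    for( int j = 0; j < n; ++j ) y[i] += A[i + j * n] * x[j]; // column-major indexing, as in BLAS
  }
#endif
}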
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 1baee42e06..10d129eb59 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +56,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006198406219482422  +DEBUG: model prefixing takes 0.006429910659790039  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -166,21 +165,21 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.078 s +8 processes with 40 diagrams generated in 0.115 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -200,9 +199,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -211,62 +210,50 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s -Wrote files for 32 helas calls in 0.164 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +Generated helas calls for 2 subprocesses (10 diagrams) in 0.042 s +Wrote files for 32 helas calls in 0.167 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.148 s +ALOHA: aloha creates 2 routines in 0.135 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.100 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 254 (offset 27 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 254 (offset 27 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m2.210s -user 0m1.890s -sys 0m0.303s -Code generation completed in 2 seconds +real 0m3.071s +user 0m2.389s +sys 0m0.594s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -279,7 +266,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -287,10 +274,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -309,7 +295,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -317,10 +303,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. 
The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index 795e11afaf..7e99b87668 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat index 66a805e521..3db737130c 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat @@ -109,6 +109,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat index 8c0f1e2199..47c2051950 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat @@ -109,6 +109,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/Source/.make_opts b/epochX/cudacpp/gq_ttq.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/.make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f b/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc b/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc +++ b/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gq_ttq.mad/Source/make_opts b/epochX/cudacpp/gq_ttq.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gq_ttq.mad/Source/makefile b/epochX/cudacpp/gq_ttq.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/makefile +++ b/epochX/cudacpp/gq_ttq.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc b/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc +++ b/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
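For reference, the grid-selection loop in the Bridge constructor above can be read as the following standalone sketch: starting from the default thread count, the thread count is halved until the grid covers nevt exactly, and the constructor gives up below s_gputhreadsmin. The function name pickGrid and the literal defaults 32/256 are illustrative only, not part of the plugin API.

// Standalone sketch (illustrative only) of the Bridge grid-selection loop above.
#include <stdexcept>
#include <string>
#include <utility>

std::pair<int, int> pickGrid( int nevt, int gputhreadsmin = 32, int gputhreads = 256 )
{
  if( nevt < gputhreadsmin || nevt % gputhreadsmin != 0 )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
  int gpublocks = nevt / gputhreads; // initial guess with the default thread count
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2; // halve the thread count until the grid covers nevt exactly
    if( gputhreads < gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" ); // should never happen for a valid nevt
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads }; // e.g. nevt=96 gives { 3, 32 }
}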
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
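Both transposition implementations above realize the same index mapping between the Fortran AOS layout momenta[ievt][ipar][ip4] and the cudacpp AOSOA layout momenta[ipagM][ipar][ip4][ieppM] with nevt = npagM*neppM. A minimal host-side sketch of the F2C direction, with an illustrative function name (npar, np4 and neppM correspond to CPPProcess::npar, CPPProcess::np4 and MemoryAccessMomenta::neppM):

// Standalone sketch of the AOS -> AOSOA index mapping used by the momenta transpositions above.
#include <cassert>

void aosToAosoa( const double* in, double* out, int nevt, int npar, int np4, int neppM )
{
  assert( nevt % neppM == 0 ); // the AOSOA layout requires nevt to be a multiple of neppM
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    const int ipagM = ievt / neppM; // "page" of neppM events
    const int ieppM = ievt % neppM; // event within the page
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                  // AOS:   in[ievt][ipar][ip4]
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // AOSOA: out[ipagM][ipar][ip4][ieppM]
        out[cpos] = in[fpos]; // F2C (Fortran to C); the C2F direction simply swaps in and out
      }
  }
}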
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
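The checkGpuBlas/assertGpuBlas helper added to GpuRuntime.h above is meant to wrap the new gpuBlas* calls from GpuAbstraction.h, alongside the per-helicity GPU streams. A standalone sketch of that lifecycle on the CUDA path, spelled with plain CUDA/cuBLAS calls and an illustrative checkGpuBlasSketch macro standing in for checkGpuBlas (on the HIP path the same pattern goes through hipStreamCreate/hipblasCreate via the abstraction macros):

// Standalone CUDA sketch of the stream + BLAS-handle lifecycle used by MatrixElementKernelDevice.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cassert>
#include <cstdio>

// Stand-in for the checkGpuBlas helper defined in GpuRuntime.h (illustrative only)
#define checkGpuBlasSketch( code )                                                    \
  {                                                                                   \
    cublasStatus_t s_ = ( code );                                                     \
    if( s_ != CUBLAS_STATUS_SUCCESS )                                                 \
    {                                                                                 \
      printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", (int)s_, __FILE__, __LINE__ ); \
      assert( s_ == CUBLAS_STATUS_SUCCESS );                                          \
    }                                                                                 \
  }

int main()
{
  cudaStream_t stream;                                     // gpuStream_t
  cudaStreamCreate( &stream );                             // gpuStreamCreate
  cublasHandle_t handle;                                   // gpuBlasHandle_t
  checkGpuBlasSketch( cublasCreate( &handle ) );           // gpuBlasCreate
  checkGpuBlasSketch( cublasSetStream( handle, stream ) ); // gpuBlasSetStream: BLAS calls on this handle now run on 'stream'
  // ... gpuBlasTgemm-style calls go here (cublasSgemm or cublasDgemm depending on MGONGPU_FPTYPE2_FLOAT) ...
  checkGpuBlasSketch( cublasDestroy( handle ) );           // gpuBlasDestroy
  cudaStreamDestroy( stream );                             // gpuStreamDestroy
  return 0;
}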
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
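On the two questions left in the comments above: gpuPeekAtLastError reports launch and configuration errors of the preceding kernel launches immediately and without blocking, whereas gpuDeviceSynchronize waits for the kernels to finish and therefore makes asynchronous execution errors surface at this well-defined point instead of at a later, unrelated API call (the behaviour the second comment describes). A minimal standalone CUDA sketch of the pattern, with an illustrative kernel name:

// Minimal CUDA sketch of the launch-then-check pattern used after sigmaKin above.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummyKernel( int* out )
{
  out[blockIdx.x * blockDim.x + threadIdx.x] = threadIdx.x;
}

int main()
{
  int* d_out = nullptr;
  cudaMalloc( (void**)&d_out, 32 * sizeof( int ) );
  dummyKernel<<<1, 32>>>( d_out );
  // Catches invalid launches (e.g. a bad grid/block configuration) immediately, without blocking
  printf( "peek: %s\n", cudaGetErrorString( cudaPeekAtLastError() ) );
  // Blocks until the kernel completes, so asynchronous execution errors surface here
  printf( "sync: %s\n", cudaGetErrorString( cudaDeviceSynchronize() ) );
  cudaFree( d_out );
  return 0;
}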
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ 
namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 99573ab87a..20611bde8f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
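The DeviceAccessJamp2 helper above fixes the layout of the jamp2 super-buffer as one contiguous block of nevt values per color, accessed as buffer[icol * nevt + ievt]. A minimal CUDA sketch of a kernel accumulating one helicity's per-color |jamp|^2 into such a buffer (kernel and argument names are illustrative; atomicAdd on double assumes compute capability 6.0 or newer):

// Illustrative CUDA sketch of accumulation into a [ncolor][nevt] buffer laid out as buffer[icol * nevt + ievt].
#include <cuda_runtime.h>

__global__ void addJamp2OneHel( double* colJamp2s,      // in/out: [ncolor * nevt], running sum over helicities
                                const double* jamp2Hel, // input:  [ncolor * nevt], |jamp|^2 for one helicity
                                int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    // atomicAdd because kernels for different helicities may run concurrently on separate streams
    atomicAdd( &colJamp2s[icol * nevt + ievt], jamp2Hel[icol * nevt + ievt] );
}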
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -405,156 +461,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxu()?) 
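The squared color flows jamp2 accumulated just above (with atomicAdd in the CUDA case, since different helicity streams update the same colAllJamp2s entries concurrently) feed the event-by-event color choice implemented further below in select_col. A minimal CPU-side sketch of that cumulative selection, omitting the icolamp/iconfig masking and using illustrative names only:

#include <cassert>
// Given the per-color squared flows jamp2[0..ncolor-1] and a random number rndcol in [0,1),
// return the chosen color in the Fortran convention [1,ncolor]: the first color whose
// cumulative fraction of the total exceeds rndcol.
int chooseColor( const double* jamp2, int ncolor, double rndcol )
{
  double total = 0;
  for( int icol = 0; icol < ncolor; icol++ ) total += jamp2[icol];
  assert( total > 0 );
  double cumul = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    cumul += jamp2[icol];
    if( rndcol < cumul / total ) return icol + 1;
  }
  return ncolor; // guard against rounding when rndcol is very close to 1
}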
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -610,7 +553,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +591,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +636,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -805,8 +760,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +769,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -977,13 +1110,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,18 +1122,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1031,93 +1163,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1228,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1251,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,21 +1260,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1219,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index b501a9772e..2c0025c7b9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index b0cc58e89c..340d51dbfa 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index 2b281a8200..83f5f0b209 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -486,51 +490,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
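The algebra behind the comment above, for a real symmetric color matrix $M$ and color flows $J = A + iB$ (a sketch of the identity, written out once for clarity):

$$ J^\dagger M J = (A - iB)^{T} M (A + iB) = A^{T}MA + i\,A^{T}MB - i\,B^{T}MA + B^{T}MB = A^{T}MA + B^{T}MB $$

since $A^{T}MB = B^{T}MA$ when $M$ is real and symmetric. This is why the code only needs the real and imaginary parts of the jamps separately, and why the symmetric matrix can be folded into the triangular cf2 with a factor 2 on the off-diagonal terms.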
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for all good helicities + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/configs.inc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/configs.inc index 225cf5aca4..0a6b8dbc07 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/configs.inc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/configs.inc @@ -57,3 +57,5 @@
C Diagram 5 DATA (SPROP(I,-3,5),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/5/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fbridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/makefile_original.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f index 1efce64e40..bb9c6d6440 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -254,17 +251,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -341,7 +327,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -387,7 +373,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -430,31 +417,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(5,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(5,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,5,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,5,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -507,10 +491,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -519,6 +505,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 6dc0abd17c..6dbbb43f91 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
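// A minimal standalone sketch (not part of the patch) of the flat-index arithmetic implied by the
// jamp super-buffer layout jamp[2][ncolor][nGoodHel][nevt] that calculate_jamps fills and that is
// documented in the cuBLAS striding note of color_sum_blas earlier in this diff. The helper name
// jampFlatIndex and the toy dimensions are hypothetical; the real code goes through the
// DeviceAccessJamp accessor instead of computing indices by hand.
#include <cassert>
inline int jampFlatIndex( int ix2, int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
{
  // ix2 == 0 selects the real part, ix2 == 1 the imaginary part
  return ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt;
}
int main()
{
  const int ncolor = 4, nhel = 3, nevt = 8; // small toy dimensions
  // e.g. the imaginary part of jamp for icol=2, ihel=1, ievt=5
  assert( jampFlatIndex( 1, 2, 1, 5, ncolor, nhel, nevt ) == ( ( 1 * ncolor + 2 ) * nhel + 1 ) * nevt + 5 );
  return 0;
}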
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -405,156 +461,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxux()?) 
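// A minimal standalone sketch of how the jamp2 running sums stored just above are later consumed for
// the event-by-event color choice (see select_col further down in this diff): the squared jamps allowed
// by icolamp for the selected iconfig are accumulated into a cumulative targetamp, and the first color
// whose normalized cumulative sum exceeds the random number is chosen, returned in the Fortran range
// [1,ncolor]. The names pickColor and icolampMask are hypothetical and used here only for illustration.
#include <cstdio>
int pickColor( const double* jamp2, const bool* icolampMask, int ncolor, double rndcol )
{
  double targetamp[16] = { 0 }; // assume ncolor <= 16 for this sketch
  for( int icolC = 0; icolC < ncolor; icolC++ )
  {
    targetamp[icolC] = ( icolC == 0 ? 0 : targetamp[icolC - 1] );
    if( icolampMask[icolC] ) targetamp[icolC] += jamp2[icolC];
  }
  for( int icolC = 0; icolC < ncolor; icolC++ )
    if( rndcol < targetamp[icolC] / targetamp[ncolor - 1] ) return icolC + 1; // NB Fortran [1,ncolor]
  return ncolor; // numerical safety net (rndcol very close to 1)
}
int main()
{
  const double jamp2[4] = { 0.1, 0.4, 0.2, 0.3 }; // toy |jamp|^2 sums over helicities
  const bool mask[4] = { true, true, true, true }; // toy icolamp mask (all colors allowed)
  printf( "selected color = %d\n", pickColor( jamp2, mask, 4, 0.45 ) ); // cumulative 0.1,0.5,0.7,1.0 -> color 2
  return 0;
}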
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -610,7 +553,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +591,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +636,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -805,8 +760,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +769,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -977,13 +1110,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,18 +1122,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1031,93 +1163,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1228,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1251,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,21 +1260,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1219,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index d658e0394e..7a811e35e9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -19,6 +19,7 @@
 #include "mgOnGpuVectors.h"
+#include "GpuAbstraction.h"
 #include "Parameters_sm.h"
 #include
@@ -78,6 +79,7 @@ namespace mg5amcCpu
 static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
 static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
 static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu-
+ static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu-
 // Hardcoded parameters for this process (constant class variables)
 // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)]
@@ -125,7 +127,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
- __global__ void
+ void
 sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
 const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -133,9 +135,11 @@
 fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
 fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
- bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation)
+ fptype_sv* allJamps, // output: jamp[ncolor*2*nevt]
+ bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation)
+ const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #else
- __global__ void
+ void
 sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
 const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -155,34 +159,45 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
- __global__ void
+ void
 sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4]
 const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
 const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
- const fptype* allrndcol, // input: random numbers[nevt] for color selection
- fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
- fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
+ fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
- int* allselcol // output: helicity selection[nevt]
- );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ int* allselcol, // output: color selection[nevt]
+ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index e36675626f..f9cde14dc2 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 61bb13c3e7..136c6cded7 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -486,51 +490,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
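+// This new file implements the color sum, i.e. it adds to |M|^2, for one helicity, the quantity sum_{i,j} jamp[i]^* * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j],
+// either on CPU (SIMD, color_sum_cpu below) or on GPU (custom kernel color_sum_kernel, or cuBLAS/hipBLAS GEMMs in color_sum_blas).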
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
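+ // In formulas, the triangular loop below accumulates deltaMEs = sum_i cf2[i][i]*(R_i^2+I_i^2) + sum_i sum_{j>i} cf2[i][j]*(R_i*R_j+I_i*I_j),
+ // where R_i/I_i are the real/imaginary parts of jamp[i] and cf2[i][j] already includes the factor 2 and the 1/colorDenom[i] normalization.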
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/configs.inc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/configs.inc index 693e4354b0..28a94fd35a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/configs.inc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/configs.inc @@ 
-57,3 +57,5 @@ C Diagram 5 DATA (SPROP(I,-3,5),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/5/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fbridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/makefile_original.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f index c8fbbe9e22..49b7ddbf25 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -254,17 +251,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -341,7 +327,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -387,7 +373,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -430,31 +417,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,5) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,5) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,5) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,5) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -507,10 +491,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -519,6 +505,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! 
color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - 
elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. @@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
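+// This header declares the color-sum interface: the DeviceAccessJamp striding helper and the GPU entry points
+// (createNormalizedColorMatrix, color_sum_gpu, color_sum_kernel), plus color_sum_cpu for the C++/SIMD backends.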
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef 
MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
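+ # NB: HIP_HOME is derived from 'hipconfig --rocmpath' above, so this checks the ROCm include tree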
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024)
+
+# To be used after the project makefile
+SHELL := /bin/bash
+
+# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829)
+# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing
+include ../../src/cudacpp_config.mk
+ifeq ($(CUDACPP_BUILDDIR),)
+ $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!)
+endif
+
+# Basic uname helpers (if not already set)
+UNAME_S ?= $(shell uname -s)
+UNAME_P ?= $(shell uname -p)
+
+# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html
+FFLAGS+= -cpp
+
+# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740)
+CXXFLAGS = -O3 -Wall -Wshadow -Wextra
+
+# Add -std=c++17 explicitly to avoid build errors on macOS
+# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked"
+ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+ CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3
+endif
+
+# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran)
+ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
+ override CXX := ccache $(CXX)
+endif
+
+# ----------------------------------------------------------------------
+# Backend library names and process id
+# ----------------------------------------------------------------------
+CUDACPP_MAKEFILE := cudacpp.mk
+processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
+
+ifeq ($(BACKEND),cuda)
+ CUDACPP_COMMONLIB := mg5amc_common_cuda
+ CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda
+else ifeq ($(BACKEND),hip)
+ CUDACPP_COMMONLIB := mg5amc_common_hip
+ CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip
+else
+ CUDACPP_COMMONLIB := mg5amc_common_cpp
+ CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp
+endif
+
+# ----------------------------------------------------------------------
+# Libraries and link line adjustments
+# ----------------------------------------------------------------------
+# Prefer LIBDIR everywhere; base makefile already defines LIBDIR.
+LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \
+    -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias
+
+# OpenMP: enable only if requested, USEOPENMP=1 (#758)
+ifeq ($(USEOPENMP),1)
+ ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
+ override OMPFLAGS = -fopenmp
+ LINKLIBS += -liomp5 # see #578
+ LINKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy'
+ else ifneq ($(shell $(CXX) --version | egrep '^clang'),)
+ override OMPFLAGS = -fopenmp
+ # For the *cpp* binary with clang, ensure libomp is found
+ $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604
+ else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+ override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang
+ else
+ override OMPFLAGS = -fopenmp
+ endif
+endif
+
+# ----------------------------------------------------------------------
+# Objects & targets
+# ----------------------------------------------------------------------
+# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk new file mode 
100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + 
logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "
To save bandwidth not all diagrams were converted to PNG."; print PAGE "
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
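# The gen_ximprove.py gridpack hunk above now splits a channel into several refine jobs
# when the requested events exceed max_request_event, and tags each split directory with
# a letter+number suffix. A simplified sketch of that arithmetic (it omits the later
# adjustment based on min_event_in_iter); split_plan and split_suffixes are illustrative names:
import string

def split_plan(needed_event, max_request_event, max_splitting, split_channels=True):
    nb_split = max(1, (int(needed_event) - 1) // int(max_request_event) + 1)
    if not split_channels:
        nb_split = 1
    return max(1, min(nb_split, max_splitting))

def split_suffixes(nb_split):
    # mirrors new_info['directory'] += alphabet[i % 26] + str((i+1)//26) in the patch
    return [string.ascii_lowercase[i % 26] + str((i + 1) // 26) for i in range(nb_split)]

# example: 9000 requested events with at most 2500 per job -> 4 split jobs a0..d0
n = split_plan(9000, 2500, max_splitting=130)
print(n, split_suffixes(n))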
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
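# The lhe_parser.py hunk above fixes the sign in the pseudorapidity property:
# eta = 0.5*ln((|p|+pz)/(|p|-pz)), which equals -ln(tan(theta/2)). A quick standalone
# check of the corrected formula (the momentum components are arbitrary test values):
import math

def pseudorapidity(px, py, pz):
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

px, py, pz = 10.0, 5.0, 40.0
theta = math.acos(pz / math.sqrt(px**2 + py**2 + pz**2))
print(pseudorapidity(px, py, pz), -math.log(math.tan(theta / 2)))  # the two numbers agree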
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
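# The do_pythia8 hunk above now drives the shower through Pythia8's own main164 example
# program by default, keeping MG5aMC_PY8_interface behind the '--old_interface' flag.
# A minimal sketch of the executable lookup order used there; find_pythia8_main is a
# hypothetical helper name, not part of the patch:
import os

def find_pythia8_main(pythia8_path):
    for candidate in (
        os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
        os.path.join(pythia8_path, 'examples', 'main164'),
    ):
        if os.path.exists(candidate):
            return candidate
    return None  # caller then falls back to the old MG5aMC_PY8_interface

print(find_pythia8_main('/usr/local/pythia8'))  # hypothetical install prefix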
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
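# The parallel-shower hunk above writes one PY8 card per event split: each card gets its
# own Main:numberOfEvents and its HEPMCoutput:scaling multiplied by that split's event
# count, so the per-event weight normalisation survives the splitting. A small sketch of
# that bookkeeping; the partition sizes and base scaling below are made-up numbers:
def per_split_settings(partition_sizes, base_scaling):
    return [{'Main:numberOfEvents': n, 'HEPMCoutput:scaling': base_scaling * float(n)}
            for n in partition_sizes]

for settings in per_split_settings([2500, 2500, 1000], base_scaling=1.0 / 6000.0):
    print(settings)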
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
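# The remove_empty_events method added above drops G directories whose events.lhe is
# (nearly) empty and explains why, by scanning the channel log backwards for a few known
# messages. A simplified standalone sketch of that classification; the keyword strings
# come from the hunk, classify_empty_channel is a hypothetical helper name:
def classify_empty_channel(log_lines):
    for line in reversed(log_lines):  # the real code uses misc.BackRead to read the file backwards
        if "Impossible BW configuration" in line:
            return "bwconfig"
        if "Loosen cuts or increase max_events" in line:
            return "cuts"
        if "all returned zero" in line:
            return "zero"
    return "unknown"

print(classify_empty_channel(["... iteration 3 ...", "Loosen cuts or increase max_events"]))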
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data b/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gq_ttq.mad/bin/madevent b/epochX/cudacpp/gq_ttq.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/madevent +++ b/epochX/cudacpp/gq_ttq.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h index a304fc85c8..c6aa6132b8 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc index 998cb505a0..c5d271333d 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index 1565ed5888..890ccfa493 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
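Since the mgDebug* macros are now unconditionally empty (the NSIGHT debug instrumentation is flagged above as no longer supported), existing call sites keep compiling but generate no code. A self-contained sketch, repeating the new no-op definitions so it builds standalone; the function body is illustrative, not plugin code:

// The four no-op definitions, as in the mgOnGpuConfig.h hunk above
#define mgDebugDeclare()      /*noop*/
#define mgDebugInitialise()   /*noop*/
#define mgDebug( code, text ) /*noop*/
#define mgDebugFinalise()     /*noop*/

void instrumentedBody()
{
  mgDebugDeclare();           // previously declared a __shared__ counter, now expands to nothing
  mgDebugInitialise();
  // ... real work would go here ...
  mgDebug( 0, __FUNCTION__ ); // start marker, now a no-op
  mgDebug( 1, __FUNCTION__ ); // end marker, now a no-op
  mgDebugFinalise();
}

Each call above expands to an empty statement, so no #ifdef is needed at the call sites.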
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 8249ac5d67..22b4bcef38 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +56,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006358146667480469  +DEBUG: model prefixing takes 0.004916191101074219  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -166,13 +165,13 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.078 s +8 processes with 40 diagrams generated in 0.068 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -184,45 +183,45 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=1 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=1 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. +Generated helas calls for 2 subprocesses (10 diagrams) in 0.028 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.145 s +ALOHA: aloha creates 2 routines in 0.121 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.670s -user 0m0.588s -sys 0m0.061s +real 0m0.648s +user 0m0.562s +sys 0m0.079s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. 
- * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
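On the grid selection just above: the Bridge constructor starts from 256 threads per block and keeps halving until nevt == gpublocks * gputhreads, refusing to go below the 32-thread minimum. A standalone restatement of that loop as a helper (the function name and signature are illustrative, not the plugin's API):

#include <stdexcept>
#include <string>
#include <utility>

// Return { gpublocks, gputhreads } with gpublocks * gputhreads == nevt exactly
std::pair<int, int> chooseGpuGrid( int nevt, int threadsDefault = 256, int threadsMin = 32 )
{
  if( nevt < threadsMin || nevt % threadsMin != 0 )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( threadsMin ) );
  int gputhreads = threadsDefault;
  int gpublocks = nevt / gputhreads; // may be 0 or too small at first
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2; // halve the block size until the grid covers nevt exactly
    if( gputhreads < threadsMin )
      throw std::logic_error( "cannot choose gputhreads" ); // unreachable for valid nevt
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads };
}

For example, chooseGpuGrid( 96 ) yields { 3, 32 }, while chooseGpuGrid( 16384 ) keeps the 256-thread default with 64 blocks.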
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
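The surrounding transposition code maps the Fortran AOS layout P_MULTI(0:3, NEXTERNAL, VECSIZE_USED), seen from C as in[ievt][ipar][ip4], onto the cudacpp AOSOA layout out[ipagM][ipar][ip4][ieppM] with ievt = ipagM * neppM + ieppM. A host-only sketch of the same index arithmetic, with npar/np4/neppM hard-coded for illustration (the real code takes them from CPPProcess and MemoryAccessMomenta):

#include <cassert>

// Illustrative AOS (Fortran order) to AOSOA (cudacpp order) momenta copy
void transposeMomentaF2C( const double* in, double* out, int nevt )
{
  constexpr int npar = 5;  // external particles, e.g. a 2->3 process (CPPProcess::npar in the plugin)
  constexpr int np4 = 4;   // E, px, py, pz (CPPProcess::np4 in the plugin)
  constexpr int neppM = 4; // events per memory page (MemoryAccessMomenta::neppM in the plugin)
  assert( nevt % neppM == 0 );
  for( int ipagM = 0; ipagM < nevt / neppM; ipagM++ )
    for( int ip4 = 0; ip4 < np4; ip4++ )
      for( int ipar = 0; ipar < npar; ipar++ )
        for( int ieppM = 0; ieppM < neppM; ieppM++ )
        {
          const int ievt = ipagM * neppM + ieppM;
          const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
          const int fpos = ievt * npar * np4 + ipar * np4 + ip4;
          out[cpos] = in[fpos]; // F2C (Fortran to C), as in the loop above
        }
}

As the comment in the patch notes, this is an AOS to AOSOA conversion rather than a true transposition: for neppM = 1 the two layouts coincide and a plain memcpy suffices.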
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
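GpuRuntime.h now provides checkGpuBlas/assertGpuBlas so that cuBLAS/hipBLAS status codes are verified in the same style as checkGpu wraps runtime calls, and GpuAbstraction.h aliases gpuBlasDgemm to cublasDgemm or hipblasDgemm (with gpuBlasTgemm picking the S or D variant according to MGONGPU_FPTYPE2_FLOAT). A minimal sketch of wrapping a plain double-precision GEMM with these macros; the column-major sizes and leading dimensions are illustrative only, and this is not the plugin's actual color-sum call:

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

#if defined( MGONGPUCPP_GPUIMPL ) && !defined( MGONGPU_HAS_NO_BLAS )
// C(m x n) = A(m x k) * B(k x n), all column-major buffers already resident on the device
void gemmSketch( const double* devA, const double* devB, double* devC, int m, int n, int k )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) ); // cublasCreate or hipblasCreate
  const double alpha = 1.0, beta = 0.0;
  checkGpuBlas( gpuBlasDgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N, m, n, k, &alpha, devA, m, devB, k, &beta, devC, m ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif

By contrast, the MatrixElementKernelDevice code later in this patch creates its handle once (m_blasHandle) and destroys it in the destructor, rather than per call as in this sketch.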
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
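When m_blasColorSum is enabled, the sigmaKin call above receives a cuBLAS/hipBLAS handle and a temporary super-buffer so that the color sum can be evaluated as dense linear algebra instead of per-thread loops. The sketch below only illustrates that idea under assumed layouts (a column-major ncolor x nevt matrix of jamp components and a hypothetical dotPerEvent reduction kernel); it is not the plugin's color_sum_blas implementation, which also covers strided-batched GEMMs over helicities and mixed precision.

// Illustrative sketch: one GEMM computes Z = cf * J for all events of one helicity, then a small kernel
// reduces per event; applied to the real and imaginary jamp components this yields the color-summed |M|^2.
#include "cublas_v2.h"

__global__ void dotPerEvent( const double* J, const double* Z, double* ME, int ncolor, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt >= nevt ) return;
  double sum = 0.;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += J[icol + ncolor * ievt] * Z[icol + ncolor * ievt]; // J and Z are column-major ncolor x nevt (assumed layout)
  ME[ievt] += sum; // add this helicity's contribution to the running sum over helicities
}

void colorSumViaGemm( cublasHandle_t handle, const double* dCf, const double* dJ, double* dZ, double* dME, int ncolor, int nevt )
{
  const double alpha = 1., beta = 0.;
  // Z(ncolor,nevt) = cf(ncolor,ncolor) * J(ncolor,nevt), all column-major
  cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, ncolor, nevt, ncolor, &alpha, dCf, ncolor, dJ, ncolor, &beta, dZ, ncolor );
  dotPerEvent<<<( nevt + 255 ) / 256, 256>>>( dJ, dZ, dME, ncolor, nevt );
}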
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace 
mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer 
DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index 81ab8669a5..7307dc9db3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
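For reference, the DeviceAccessJamp2 accessor introduced above addresses a [ncolor][nevt] device buffer in icol-major, event-minor order, so that the nevt threads of the grid touch consecutive (coalesced) addresses for each color index. A minimal sketch of a kernel reading such a buffer with that indexing (the sumOverColors kernel itself is hypothetical and not part of this patch):

// Illustrative sketch: same icol * nevt + ievt indexing as DeviceAccessJamp2::kernelAccessIcolConst
__global__ void sumOverColors( const double* colBuffer, double* out, int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  double sum = 0.;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += colBuffer[icol * nevt + ievt]; // one coalesced read per color index for this event/thread
  out[ievt] = sum;
}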
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -400,156 +456,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxu()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
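The comments removed above document the algebra that the color sum evaluates: since the color matrix cf is real and symmetric, |M|^2 reduces to real quadratic forms over the real and the imaginary jamp components separately. As a standalone reference for what the removed implementation around this point (and the new color_sum kernels) computes per event and per helicity, here is a host-side sketch using the ncolor=4 denom and cf values of this gu_ttxu process (std::complex stands in for cxtype; illustrative only):

// Illustrative host-side sketch of the color-matrix quadratic form removed from calculate_wavefunctions/calculate_jamps
#include <complex>

double colorSum( const std::complex<double> jamp[4] ) // QCD partial amplitudes for one event and one helicity
{
  static constexpr double denom[4] = { 1, 1, 1, 1 };
  static constexpr double cf[4][4] = {
    { 12, 4, 4, 0 },
    { 4, 12, 0, 4 },
    { 4, 0, 12, 4 },
    { 0, 4, 4, 12 } };
  double deltaME = 0;
  for( int icol = 0; icol < 4; icol++ )
  {
    // cf is real, so (A-iB) cf (A+iB) = A cf A + B cf B: accumulate real and imaginary parts separately
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < 4; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real();
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    deltaME += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  return deltaME; // this helicity's contribution to the running |M|^2 sum over helicities
}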
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -605,7 +548,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -639,6 +586,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -680,6 +631,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -800,8 +755,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -809,25 +764,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -972,13 +1105,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -990,18 +1117,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1026,93 +1158,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1154,7 +1223,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1177,7 +1246,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1186,21 +1255,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1214,8 +1285,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1231,11 +1304,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1337,14 +1411,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index b501a9772e..2c0025c7b9 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
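A minimal sketch of the jamp super-buffer indexing implied by the parameter comments above (illustrative only, not part of the patch; jampIndex is a hypothetical helper, and the strides follow the DeviceAccessJamp layout comments quoted later in color_sum_blas):

// Hypothetical helper: flat index into the ghelAllJamps super-buffer of shape
// [2][ncolor][nGoodHel][nevt] (ix2 = 0 for the real part, 1 for the imaginary part).
#include <cassert>

inline int jampIndex( int ix2, int icol, int ighel, int ievt,
                      int ncolor, int nGoodHel, int nevt )
{
  assert( ix2 == 0 || ix2 == 1 );
  return ix2 * ncolor * nGoodHel * nevt + icol * nGoodHel * nevt + ighel * nevt + ievt;
}

// The per-helicity sub-buffer handed to calculate_jamps is then just an offset view,
// e.g. fptype* hAllJamps = ghelAllJamps + ighel * nevt, so that inside the kernel
// icol strides by nGoodHel * nevt and ievt strides by 1.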
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
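A standalone scalar illustration of the triangular color sum described in the comment above (plain C++, no SIMD, hypothetical function name; it assumes the ncolor=4 color matrix and unit denominators of this process, so that cf[icol][jcol]/denom[icol] is symmetric and the off-diagonal terms can be doubled):

// Minimal sketch: |M|^2 contribution of one helicity from its color flows jamp[ncolor],
// using only the diagonal and upper triangle of the (normalized) real symmetric color matrix.
#include <array>
#include <complex>

double colorSumOneHelicity( const std::array<std::complex<double>, 4>& jamp )
{
  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = { { 12, 4, 4, 0 },
                                          { 4, 12, 0, 4 },
                                          { 4, 0, 12, 4 },
                                          { 0, 4, 4, 12 } };
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    // Diagonal term plus doubled upper-triangle terms (valid because cf/denom is symmetric here)
    double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real();
    double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < ncolor; jcol++ )
    {
      ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
      ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
    }
    // Only the real combination survives: for real symmetric M, (A-iB)M(A+iB) = AMA + BMB
    deltaME += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI;
  }
  return deltaME;
}

For this process the triangular loop gives the same result as the full double loop over the symmetric matrix, with the factors of 2 and the division by denom folded in at compile time, matching the constexpr cf2 scheme above.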
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fbridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/makefile_original.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index c1c42990a2..d959d2636a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
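For context on how the calculate_jamps kernel described above is scheduled, here is a self-contained CUDA sketch of the one-good-helicity-per-stream pattern used in sigmaKin (toy kernel and a simplified one-value-per-event buffer, hypothetical names; the real super-buffer has the [2][ncolor][nGoodHel][nevt] layout):

// Toy stand-in for calculate_jamps: one launch per good helicity, each on its own stream.
#include <cuda_runtime.h>
#include <vector>

__global__ void fillJampsForHelicity( double* hAllJamps, int ihel, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) hAllJamps[ievt] = ihel + 0.001 * ievt; // toy payload, one value per event
}

int main()
{
  const int gpublocks = 4, gputhreads = 256, nevt = gpublocks * gputhreads;
  const int nGoodHel = 8;
  double* ghelAllJamps = nullptr; // super-buffer: one nevt-sized slice per good helicity
  if( cudaMalloc( &ghelAllJamps, nGoodHel * nevt * sizeof( double ) ) != cudaSuccess ) return 1;
  std::vector<cudaStream_t> ghelStreams( nGoodHel );
  for( auto& s : ghelStreams ) cudaStreamCreate( &s );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    double* hAllJamps = ghelAllJamps + ighel * nevt; // per-helicity sub-buffer (offset view)
    fillJampsForHelicity<<<gpublocks, gputhreads, 0, ghelStreams[ighel]>>>( hAllJamps, ighel, nevt );
  }
  cudaDeviceSynchronize(); // do not start helicity/color selection until all streams are done
  for( auto& s : ghelStreams ) cudaStreamDestroy( s );
  cudaFree( ghelAllJamps );
  return 0;
}

Because several streams may later accumulate into shared per-color buffers concurrently, the actual kernels use atomicAdd for those running sums, as noted in the colAllJamp2s code further below.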
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -400,156 +456,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxux()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -605,7 +548,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -639,6 +586,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -680,6 +631,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -800,8 +755,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -809,25 +764,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -972,13 +1105,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -990,18 +1117,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1026,93 +1158,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1154,7 +1223,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1177,7 +1246,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1186,21 +1255,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1214,8 +1285,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1231,11 +1304,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1337,14 +1411,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index d658e0394e..7a811e35e9 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
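For illustration only, and not part of the patch: the sigmaKin rewrite above launches one calculate_jamps kernel per good helicity on its own CUDA/HIP stream and synchronizes only once before helicity and color selection. A minimal standalone CUDA sketch of that launch-and-synchronize pattern, using a toy kernel, hypothetical names and toy sizes rather than the plugin's real API, could look like this:

    // Sketch of a one-stream-per-helicity launch pattern (toy kernel and sizes, not the plugin API)
    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void toyKernelPerHelicity( float* out, int ihel, int nevt )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
      out[ihel * nevt + ievt] = (float)ihel;                  // fill one helicity slice of a "super-buffer"
    }

    int main()
    {
      const int nhel = 4, gpublocks = 2, gputhreads = 32, nevt = gpublocks * gputhreads;
      float* d_out = nullptr;
      cudaMalloc( &d_out, nhel * nevt * sizeof( float ) );
      cudaStream_t streams[nhel];
      for( int ihel = 0; ihel < nhel; ihel++ ) cudaStreamCreate( &streams[ihel] );
      for( int ihel = 0; ihel < nhel; ihel++ ) // kernels for different helicities may overlap on the device
        toyKernelPerHelicity<<<gpublocks, gputhreads, 0, streams[ihel]>>>( d_out, ihel, nevt );
      cudaDeviceSynchronize(); // single synchronization point before any cross-helicity reduction or selection
      for( int ihel = 0; ihel < nhel; ihel++ ) cudaStreamDestroy( streams[ihel] );
      cudaFree( d_out );
      printf( "done\n" );
      return 0;
    }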
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
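For illustration only: the comment block above relies on the color matrix being real and symmetric, so that for jamp = A + iB the imaginary cross terms of (A - iB)·C·(A + iB) cancel and only A·C·A + B·C·B survives; the off-diagonal entries can then be folded in with a factor 2 over the upper triangle. A minimal scalar reference in plain C++ (std::complex, dummy jamp values, the 4x4 colorMatrix/colorDenom of this process, no SIMD or accessors), mirroring the triangular loop that follows, might look like this:

    #include <complex>
    #include <cstdio>

    int main()
    {
      constexpr int ncolor = 4;
      constexpr double denom[ncolor] = { 1, 1, 1, 1 };
      constexpr double cf[ncolor][ncolor] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
      const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 }, { 0.7, -0.8 } }; // dummy QCD partial amplitudes
      double deltaME = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        // diagonal term, then off-diagonal terms counted twice (cf is symmetric and denom is constant here)
        double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real();
        double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
        for( int jcol = icol + 1; jcol < ncolor; jcol++ )
        {
          ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
          ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
        }
        deltaME += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI; // A.C.A + B.C.B contribution
      }
      printf( "|M|^2 contribution for this helicity: %f\n", deltaME );
      return 0;
    }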
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fbridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/makefile_original.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt 
+ ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
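For illustration only: the DeviceAccessJamp accessor above documents the "all helicities" striding, with real and imaginary parts stored as two separate [ncolor][nhel][nevt] blocks and the event index running fastest. A small host-side sketch with toy dimensions (not the plugin's accessor classes) that writes and reads one jamp element using exactly that offset formula:

    #include <vector>
    #include <cstdio>

    // Offset matching the striding documented in DeviceAccessJamp (ievt fastest, then ihel, then icol, then re/im block)
    inline int jampOffset( int ix2, int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
    {
      return ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt;
    }

    int main()
    {
      const int ncolor = 4, nhel = 2, nevt = 8; // toy sizes
      std::vector<double> buffer( 2 * ncolor * nhel * nevt, 0 );
      // store jamp(icol=3, ihel=1, ievt=5) = 1.5 - 2.5i
      buffer[jampOffset( 0, 3, 1, 5, ncolor, nhel, nevt )] = 1.5;  // real part block
      buffer[jampOffset( 1, 3, 1, 5, ncolor, nhel, nevt )] = -2.5; // imaginary part block
      printf( "re=%f im=%f\n",
              buffer[jampOffset( 0, 3, 1, 5, ncolor, nhel, nevt )],
              buffer[jampOffset( 1, 3, 1, 5, ncolor, nhel, nevt )] );
      return 0;
    }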
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h index a304fc85c8..c6aa6132b8 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc index 998cb505a0..c5d271333d 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h index 1565ed5888..890ccfa493 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported (they may be used for the colour sum on GPU) +// For both CUDA and HIP, by default, assume that BLAS is available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
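As a minimal sketch of how the MGONGPU_HAS_NO_BLAS macro defined in mgOnGpuConfig.h above is meant to be consumed in host code (the helper name below is hypothetical and the calls use the plain cuBLAS API; the plugin's own GPU abstraction layer may wrap this differently):

#include "mgOnGpuConfig.h"
#ifndef MGONGPU_HAS_NO_BLAS
#include "cublas_v2.h"
#endif
#include <stdexcept>

// Create a cuBLAS handle only when BLAS support is compiled in; in
// MGONGPU_HAS_NO_BLAS builds return a null pointer so that the caller
// falls back to the plain CUDA kernel implementation of the colour sum.
inline void* createBlasHandleOrNull() // hypothetical helper, for illustration only
{
#ifndef MGONGPU_HAS_NO_BLAS
  cublasHandle_t handle = nullptr;
  if( cublasCreate( &handle ) != CUBLAS_STATUS_SUCCESS )
    throw std::runtime_error( "cublasCreate failed" );
  return handle;
#else
  return nullptr; // no BLAS available in this build
#endif
}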
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index c46ef95a65..f4896d16ca 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,17 +46,16 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  @@ -123,21 +122,21 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.006 s +1 processes with 4 diagrams generated in 0.007 s Total: 1 processes with 4 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -149,59 +148,54 @@ FileWriter b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx -DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (4 diagrams) in 0.009 s -Wrote files for 12 helas calls in 0.076 s +DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (4 diagrams) in 0.011 s +Wrote files for 12 helas calls in 0.085 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.266 s 
+ALOHA: aloha creates 4 routines in 0.231 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.250 s +ALOHA: aloha creates 8 routines in 0.197 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses/P1_gg_bbx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. quit -real 0m2.141s -user 0m1.860s -sys 0m0.270s +real 0m2.628s +user 0m2.164s +sys 0m0.450s Code generation completed in 2 seconds ************************************************************ * * @@ -215,7 +209,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -223,10 +217,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -245,7 +238,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -253,10 +246,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/heft_gg_bb.mad/COPYRIGHT b/epochX/cudacpp/heft_gg_bb.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/COPYRIGHT +++ b/epochX/cudacpp/heft_gg_bb.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat index 92581deeee..abc60404ab 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat index 8af20dc4e4..3802880982 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat index 0815703ee4..6917ce597f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt b/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts b/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f b/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc b/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/makefile b/epochX/cudacpp/heft_gg_bb.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/makefile +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc b/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
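The grid-selection logic in the Bridge constructor above can be read in isolation: start from 256 GPU threads per block and halve until nevt == gpublocks*gputhreads, never dropping below s_gputhreadsmin (32). A minimal standalone sketch of that selection, assuming only what the constructor code shows (the helper name chooseGpuGrid is illustrative and not part of the patch):

#include <stdexcept>
#include <utility>

// Mirror of the while loop in the Bridge constructor: nevt must be a non-zero multiple of gputhreadsmin.
inline std::pair<int, int> chooseGpuGrid( int nevt, int gputhreadsmin = 32 )
{
  int gputhreads = 256;             // default number of gpu threads, as in the constructor
  int gpublocks = nevt / gputhreads;
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2;                // halve the block size until the grid covers nevt exactly
    if( gputhreads < gputhreadsmin ) throw std::logic_error( "cannot choose gputhreads" );
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads }; // e.g. nevt=8192 gives (32,256)
}

For instance, nevt=16416 (a multiple of 32 but not of 64) falls all the way through to gputhreads=32 and gpublocks=513.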
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
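The AOS-to-AOSOA mapping used in these transposition routines is fully determined by the two index formulas fpos = ievt*npar*np4 + ipar*np4 + ip4 and cpos = ipagM*npar*np4*neppM + ipar*np4*neppM + ip4*neppM + ieppM, with ievt = ipagM*neppM + ieppM. A minimal host-side sketch of the same F2C copy, assuming only those conventions (the helper name transposeF2C is illustrative):

#include <cassert>

// AOS in[nevt][npar][np4] (Fortran P_MULTI as read from C) -> AOSOA out[npagM][npar][np4][neppM].
template<typename T>
void transposeF2C( const T* in, T* out, int nevt, int npar, int np4, int neppM )
{
  assert( nevt % neppM == 0 ); // the number of events must be a multiple of neppM
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    const int ipagM = ievt / neppM;
    const int ieppM = ievt % neppM;
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4;
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
        out[cpos] = in[fpos]; // F2C (Fortran to C)
      }
  }
}

For neppM=1 the two layouts coincide (cpos equals fpos) and the copy degenerates to a memcpy, as noted in the code above.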
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
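The gpuBlas* and checkGpuBlas abstractions introduced above are meant to let a single code path target either cuBLAS or hipBLAS with uniform error checking. A minimal sketch of the intended usage, assuming a GPU build with BLAS enabled (MGONGPUCPP_GPUIMPL defined and MGONGPU_HAS_NO_BLAS undefined); the function and buffer names are illustrative:

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

// Compute y += x on the device for two double buffers of length n, on a given stream.
void deviceAxpy( int n, const double* d_x, double* d_y, gpuStream_t stream )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // cublasCreate or hipblasCreate
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // attach the (e.g. per-helicity) stream
  const double alpha = 1.;
  checkGpuBlas( gpuBlasDaxpy( handle, n, &alpha, d_x, 1, d_y, 1 ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
}

In the plugin itself a single handle is created per MatrixElementKernelDevice and reused for all good helicities, and the precision-agnostic gpuBlasT* aliases select the S or D entry points according to MGONGPU_FPTYPE2_FLOAT.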
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
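The per-good-helicity streams created in computeGoodHelicities above pair naturally with the gpuLaunchKernelStream macro from GpuAbstraction.h, which is assumed here to expand to a kernel<<< blocks, threads, 0, stream >>>( ... ) launch. A minimal sketch for a GPU build compiled with nvcc or hipcc; the kernel scaleMEs and the function launchPerHelicity are illustrative and not part of the plugin:

#include "GpuAbstraction.h"
#include "GpuRuntime.h"
#include <vector>

__global__ void scaleMEs( double* mes, double factor, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) mes[ievt] *= factor;
}

// Launch one kernel per good helicity, each on its own stream, over one nevt-sized slice of a super-buffer.
void launchPerHelicity( double* d_helMEs, int nGoodHel, int gpublocks, int gputhreads )
{
  const int nevt = gpublocks * gputhreads;
  std::vector<gpuStream_t> streams( nGoodHel );
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) gpuStreamCreate( &streams[ighel] );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    gpuLaunchKernelStream( scaleMEs, gpublocks, gputhreads, streams[ighel], d_helMEs + ighel * nevt, 2., nevt );
  checkGpu( gpuDeviceSynchronize() ); // wait for all streams before reusing the buffers
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) gpuStreamDestroy( streams[ighel] );
}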
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h index 90075da66e..7d7b960511 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_heft_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 
+201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef 
DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc index 5d6a4e1f06..4630760b2c 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_heft_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_heft_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 3; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
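// [Editor's illustrative sketch, not part of the patch] The new GPU code path replaces the
// per-event jamp handling with per-helicity "super-buffers": as documented later in this patch,
// ghelAllJamps is laid out as [2 (re/im)][ncolor][nGoodHel][nevt] with the event index running
// fastest (the same striding used by convertD2F_Jamps and by the BLAS color sum). The helper
// below reproduces that flat indexing in plain C++; jampSuperIndex and the sizes used in main()
// are names and values invented for this sketch only.
#include <cassert>
#include <cstdio>

// Flat offset of element (ix2, icol, ighel, ievt) in a buffer of shape [2][ncolor][nGoodHel][nevt]
constexpr int jampSuperIndex( int ix2, int icol, int ighel, int ievt, int ncolor, int nGoodHel, int nevt )
{
  return ix2 * ncolor * nGoodHel * nevt + icol * nGoodHel * nevt + ighel * nevt + ievt;
}

int main()
{
  const int ncolor = 3, nGoodHel = 16, nevt = 8192; // example sizes (this P1_gg_bbx process has ncolor=3, ncomb=16)
  const int ighel = 5, icol = 2, ievt = 42;
  // Offsetting the base pointer by ighel*nevt (as sigmaKin does for hAllJamps) and then indexing
  // with ihel=0 addresses the same element as indexing the full super-buffer with ighel:
  assert( jampSuperIndex( 1, icol, ighel, ievt, ncolor, nGoodHel, nevt ) ==
          ighel * nevt + jampSuperIndex( 1, icol, 0, ievt, ncolor, nGoodHel, nevt ) );
  printf( "flat offset = %d\n", jampSuperIndex( 1, icol, ighel, ievt, ncolor, nGoodHel, nevt ) );
  return 0;
}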
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -381,155 +437,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_bbx()?) 
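// [Editor's illustrative sketch, not part of the patch] The per-color |jamp|^2 running sums
// accumulated just above (into colAllJamp2s, via atomicAdd now that one CUDA stream per good
// helicity is used) are later turned into a cumulative distribution by select_col, which draws
// one color per event from a uniform random number. The standalone helper below mimics that
// selection on the host; pickColor and its arguments are names invented for this sketch, not
// part of the plugin API.
// Returns the selected color in the Fortran convention [1,ncolor], or 0 if no color contributes
// (analogous to the channelId == 0 case). jamp2 holds the per-color |jamp|^2 sums over good
// helicities, allowed is the icolamp mask for the chosen iconfig, rndcol is uniform in [0,1).
inline int pickColor( const double* jamp2, const bool* allowed, double rndcol, int ncolor )
{
  double cumul[32] = {}; // assume ncolor <= 32 in this sketch (ncolor = 3 for this process)
  double running = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    if( allowed[icol] ) running += jamp2[icol]; // cumulative sum over allowed color flows
    cumul[icol] = running;
  }
  if( running <= 0 ) return 0; // no allowed color flow: leave the selection empty
  for( int icol = 0; icol < ncolor; icol++ )
    if( rndcol < cumul[icol] / cumul[ncolor - 1] ) return icol + 1;
  return ncolor; // numerical safety net (rndcol effectively rounded up to 1)
}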
- - // The color denominators (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 1 }; // 1-D array[3] - - // The color matrix (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2, 6 }, - { -2, 16, 6 }, - { 2, 2, 6 } }; // 2-D array[3][3] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
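// [Editor's illustrative sketch, not part of the patch] The color algebra removed here is moved
// into the new color_sum.cc; its core is the quadratic form conj(J) * (cf/denom) * J, which for a
// real color matrix reduces to Re(J)*M*Re(J) + Im(J)*M*Im(J), as the comments above explain. The
// standalone function below mirrors the straightforward square-matrix variant for the ncolor=3
// matrix of this P1_gg_bbx subprocess; colorSumOneHelicity is a name invented for this sketch.
#include <complex>

// Color-summed |M|^2 contribution of one helicity, given the ncolor=3 partial amplitudes jamp.
inline double colorSumOneHelicity( const std::complex<double> jamp[3] )
{
  static constexpr double denom[3] = { 3, 3, 1 };
  static constexpr double cf[3][3] = { { 16, -2, 6 }, { -2, 16, 6 }, { 2, 2, 6 } };
  double deltaME = 0;
  for( int icol = 0; icol < 3; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < 3; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real();
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    deltaME += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  return deltaME; // the caller adds this over good helicities and finally divides by 256 (spin/color average)
}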
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -569,7 +513,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -602,6 +550,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MB ); m_masses.push_back( m_pars->mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MB, (fptype)m_pars->mdl_MH, (fptype)m_pars->mdl_WH }; @@ -643,6 +595,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_heft::ZERO ); m_masses.push_back( Parameters_heft::mdl_MB ); m_masses.push_back( Parameters_heft::mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -763,8 +719,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -772,25 +728,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // 
Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -935,13 +1069,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -953,18 +1081,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -989,93 +1122,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1117,7 +1187,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1140,7 +1210,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1149,21 +1219,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1177,8 +1249,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1194,11 +1268,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1300,14 +1375,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h index 30c5663297..cacb35c052 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_heft.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 4; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 3; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f index 0b39d55964..263997c37e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f index c57e06d578..3eaacf358b 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc new file mode 100644 index 0000000000..94b1137d64 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc @@ -0,0 +1,428 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 1 }; // 1-D array[3] + + // The color matrix (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2, 6 }, + { -2, 16, 6 }, + { 2, 2, 6 } }; // 2-D array[3][3] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for 
one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over 
helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! 
From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need 
one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/configs.inc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/configs.inc index b94e284b2f..8f12a38cbe 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/configs.inc +++ 
b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/configs.inc @@ -30,3 +30,5 @@ C Diagram 4 DATA (SPROP(I,-2,4),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/4/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fbridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/makefile_original.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f index 598338d03e..e5700f7694 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -227,17 +224,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -307,7 +293,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +336,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,26 +380,26 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 3) /5.333333333333333D+00, - $ -6.666666666666666D-01,2.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 3) /16,-4,12/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 3) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 4, 5) /16,12/ C 1 T(2,1,3,4) - DATA (CF(I, 3),I= 1, 3) /2.000000000000000D+00 - $ ,2.000000000000000D+00,6.000000000000000D+00/ + DATA (CF(I),I= 6, 6) /18/ C 1 T(3,4) Tr(1,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(MDL_MB - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WH.NE.0D0) FK_MDL_WH = SIGN(MAX(ABS(MDL_WH), ABS(MDL_MH - $ *SMALL_WIDTH_TREATMENT)), MDL_WH) + FK_ZERO = 0D0 + IF(MDL_WH.NE.0D0) THEN + FK_MDL_WH = SIGN(MAX(ABS(MDL_WH), ABS(MDL_MH + $ *SMALL_WIDTH_TREATMENT)), MDL_WH) + ELSE + FK_MDL_WH = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
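The triangular colour sum used both by the new Fortran MATRIX1 loop (next hunk) and by color_sum_cpu above relies on the normalized colour matrix CF(i,j)/denom(i) being real and symmetric: |M|^2 = Re(J)·N·Re(J) + Im(J)·N·Im(J), the imaginary cross terms cancel, and only the upper triangle needs to be stored if the off-diagonal entries are pre-doubled. A minimal standalone C++ sketch of this idea follows; the names (packUpperTriangle, colourSum) and the test amplitudes are illustrative only and are not part of the generated sources, while the matrix and denominators are the gg->bbx values from color_sum.cc above.

// Standalone sketch: packed upper-triangular colour sum vs the full quadratic form.
#include <array>
#include <complex>
#include <cstdio>

constexpr int ncol = 3;
constexpr double colorMatrix[ncol][ncol] = { { 16, -2, 6 }, { -2, 16, 6 }, { 2, 2, 6 } };
constexpr double colorDenom[ncol] = { 3, 3, 1 };

// Pack the normalized matrix N(i,j) = colorMatrix[i][j]/colorDenom[i] into its upper
// triangle, doubling the off-diagonal terms (valid because N is real and symmetric).
constexpr auto packUpperTriangle()
{
  std::array<double, ncol * ( ncol + 1 ) / 2> cf{};
  int k = 0;
  for( int i = 0; i < ncol; i++ )
  {
    cf[k++] = colorMatrix[i][i] / colorDenom[i];
    for( int j = i + 1; j < ncol; j++ ) cf[k++] = 2 * colorMatrix[i][j] / colorDenom[i];
  }
  return cf;
}

// Triangular sum: |M|^2 += sum_i ( ReJ_i * ztempR_i + ImJ_i * ztempI_i ) with j >= i only.
double colourSum( const std::complex<double> jamp[ncol] )
{
  static constexpr auto cf = packUpperTriangle();
  double me2 = 0;
  int k = 0;
  for( int i = 0; i < ncol; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = i; j < ncol; j++, k++ )
    {
      ztempR += cf[k] * jamp[j].real();
      ztempI += cf[k] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI; // AMA + BMB
  }
  return me2;
}

int main()
{
  const std::complex<double> jamp[ncol] = { { 1.0, 0.5 }, { -0.3, 0.7 }, { 0.2, -0.1 } };
  // Cross-check against the full double loop Re( sum_ij conj(J_i) N(i,j) J_j ).
  double ref = 0;
  for( int i = 0; i < ncol; i++ )
    for( int j = 0; j < ncol; j++ )
      ref += ( std::conj( jamp[i] ) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j] ).real();
  printf( "triangular: %f  full: %f\n", colourSum( jamp ), ref );
  return 0;
}

Both numbers agree because the imaginary cross terms cancel for a real symmetric matrix, which is the same observation the generated CPU kernel states in its comments; only the loop bounds and the pre-doubled packed entries change relative to the full NCOLOR x NCOLOR sum.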
@@ -455,10 +442,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -467,6 +456,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int 
nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
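(Illustration, not part of the patch.) The backend switch in the makefile being deleted above, which this patch moves out of the per-process makefile (presumably into the new cudacpp overlay include), derives a pair of cudacpp library names from BACKEND and the P* subprocess directory name. A Python sketch of that selection, with hypothetical names and an example directory:

```python
import os

def cudacpp_libs(backend, subproc_dir):
    """Sketch of the make logic: common and per-process cudacpp library names."""
    # keep the last two '_'-separated fields of the directory name,
    # mirroring the awk one-liner that defines processid_short
    parts = os.path.basename(subproc_dir).split('_')
    processid_short = '_'.join(parts[-2:])
    suffix = backend if backend in ('cuda', 'hip') else 'cpp'
    return f'mg5amc_common_{suffix}', f'mg5amc_{processid_short}_{suffix}'

# e.g. cudacpp_libs('cuda', 'P1_gg_bbx') -> ('mg5amc_common_cuda', 'mg5amc_gg_bbx_cuda')
```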
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk 
b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk new file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
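(Illustration, not part of the patch.) The new `flavour_bias` run-card option added in banner.py above is a pair [abs(PDG), enhancement factor]; the check_validity rule requires exactly two entries, rejects negative values, and forces event_norm to 'bias' whenever the enhancement is non-trivial, since the enhanced events are written with correspondingly reduced weights. A standalone Python sketch of that rule:

```python
def check_flavour_bias(flavour_bias, event_norm):
    """Sketch of the run_card consistency check for flavour_bias."""
    if len(flavour_bias) != 2:
        raise ValueError("'flavour_bias' must be [abs(PDG), enhancement factor]")
    if any(x < 0 for x in flavour_bias):
        raise ValueError("flavour and enhancement factor must be positive")
    pdg, factor = flavour_bias
    if factor != 1 and event_norm != 'bias':
        # events are reweighted by 1/factor, so the normalisation must be 'bias'
        event_norm = 'bias'
    return event_norm

# Example from the run-card comment: enhance b-quark events by a factor 100
assert check_flavour_bias([5, 100], 'average') == 'bias'
```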
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: 
raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "
To save bandwidth not all diagrams were converted to PNG."; print PAGE "
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the 
variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
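Aside on the define_init_banner hunk above: casting the process-group identifier to int before using it as a dictionary key avoids the same group appearing under both its string form (read from the banner) and its integer form, which would silently split one cross section into two entries. A toy illustration, with made-up numbers:

grouped_cross = {}
for group, cross in [(1, 0.5), ('1', 0.25)]:    # same group, once as int and once as str
    grouped_cross[group] = grouped_cross.get(group, 0.0) + cross
print(grouped_cross)                             # {1: 0.5, '1': 0.25} -- two entries, wrong total

grouped_cross = {}
for group, cross in [(1, 0.5), ('1', 0.25)]:
    grouped_cross[int(group)] = grouped_cross.get(int(group), 0.0) + cross
print(grouped_cross)                             # {1: 0.75} -- single entry, as intended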
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
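Aside on the pseudorapidity fix above: the conventional definition is eta = 0.5*ln((|p| + pz)/(|p| - pz)), equivalently -ln(tan(theta/2)), and the old expression returned -eta. A quick numerical cross-check with illustrative momentum components (not part of the patch):

import math

def pseudorapidity(px, py, pz):
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))   # corrected sign, as in the hunk above

px, py, pz = 0.3, 0.4, 1.2
theta = math.acos(pz / math.sqrt(px**2 + py**2 + pz**2))
assert abs(pseudorapidity(px, py, pz) - (-math.log(math.tan(theta / 2)))) < 1e-12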
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
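Aside on the new remove_empty_events helper above: it inspects the tail of each channel log to explain why events.lhe is missing or empty, bucketing directories by the first known marker found. A simplified sketch of that classification idea; the marker strings are taken from the hunk, while the file handling here is deliberately naive compared with the backwards reader (misc.BackRead) used in the patch, and the 200-line tail is an arbitrary example value.

MARKERS = [
    ('bwconfig', 'Impossible BW configuration'),
    ('cuts',     'Loosen cuts or increase max_events'),
    ('zero',     'all returned zero'),
]

def classify_empty_channel(log_path):
    try:
        with open(log_path) as log:
            tail = log.readlines()[-200:]          # only the end of the log matters here
    except OSError:
        return 'not found'
    for line in reversed(tail):                    # scan backwards, like the real helper
        for reason, marker in MARKERS:
            if marker in line:
                return reason
    return 'unknown'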
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/madevent b/epochX/cudacpp/heft_gg_bb.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/madevent +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h index 1b04401547..534bb65c13 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc index 0fa5a34cf0..3b4c719337 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h index 0faa7bb71e..7ab2db5300 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index 04039fcd14..c8cdee7d2a 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,17 +46,16 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  @@ -123,49 +122,49 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.006 s +1 processes with 4 diagrams generated in 0.008 s Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. 
-Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. +Generated helas calls for 1 subprocesses (4 diagrams) in 0.010 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.261 s +ALOHA: aloha creates 4 routines in 0.214 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. quit -real 0m0.646s -user 0m0.583s -sys 0m0.051s -Code generation completed in 1 seconds +real 0m0.656s +user 0m0.586s +sys 0m0.063s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/COPYRIGHT b/epochX/cudacpp/heft_gg_bb.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/COPYRIGHT +++ b/epochX/cudacpp/heft_gg_bb.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
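//--------------------------------------------------------------------------
// Illustrative usage sketch (assuming the GpuAbstraction.h and GpuRuntime.h
// headers above, in a GPU build where MGONGPU_HAS_NO_BLAS is not defined):
// every cuBLAS/hipBLAS call returns a gpuBlasStatus_t and is wrapped in
// checkGpuBlas, in the same way that checkGpu wraps runtime API calls.
// The function name exampleBlasSetup is hypothetical.
#include "GpuAbstraction.h"
#include "GpuRuntime.h"
inline void exampleBlasSetup()
{
  gpuStream_t stream;
  gpuStreamCreate( &stream );                         // checkGpu is built into the gpuStreamCreate macro
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // aborts (via assert) on a non-SUCCESS status
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // attach the handle to a specific stream
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream );
}
//--------------------------------------------------------------------------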
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
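//--------------------------------------------------------------------------
// Illustrative sketch (hypothetical helper, not in the generated code) of how
// the per-helicity "super-buffer" sizes allocated in computeGoodHelicities
// above scale: each good helicity gets its own slice of nevt MEs, of
// ncolor*2 jamp elements per event, and (when BLAS color sums are enabled)
// of temporary workspace. With e.g. nGoodHel=16, nevt=524288 and ncolor=3
// this gives about 8.4M ME elements and 50.3M jamp elements.
#include <cstddef>
struct HelSuperBufferSizes
{
  std::size_t helMEs;    // nGoodHel * nevt                    (m_pHelMEs)
  std::size_t helJamps;  // nGoodHel * ncolor * 2 * nevt       (m_pHelJamps, 2 = real/imag in this sketch)
  std::size_t helNumDen; // nGoodHel * nevt for numerators plus the same for denominators
  std::size_t helBlasTmp; // fptype2 workspace for BLAS color sums (0 if BLAS is disabled)
};
inline HelSuperBufferSizes
helSuperBufferSizes( std::size_t nGoodHel, std::size_t nevt, std::size_t ncolor, bool blas, bool mixedFp )
{
  constexpr std::size_t nx2 = 2; // real and imaginary parts
  HelSuperBufferSizes s{};
  s.helMEs = nGoodHel * nevt;
  s.helJamps = nGoodHel * ncolor * nx2 * nevt;
  s.helNumDen = 2 * nGoodHel * nevt;
  if( blas )
    s.helBlasTmp = mixedFp
                     ? nGoodHel * ( 2 * ncolor * nx2 + 1 ) * nevt // mixed double/float mode
                     : nGoodHel * ncolor * nx2 * nevt;            // plain double or float mode
  return s;
}
//--------------------------------------------------------------------------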
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h index 90075da66e..7d7b960511 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_heft_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 
+201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef 
DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc index b9f394434a..e9ac65dc13 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_heft_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_heft_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 3; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
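For reference, a minimal sketch (not part of the patch) of the flat layout assumed for the per-helicity jamp buffers described in the calculate_jamps signature above: one super-buffer holds real/imaginary parts (ix2), colors (icol), good helicities (ighel) and events (ievt), with the event index fastest so that consecutive GPU threads touch consecutive memory. The helper name toyJampIndex is illustrative only; the generated code goes through DeviceAccessJamp or the "ghelAllJamps + ighel * nevt" per-helicity view, but the strides match those written out explicitly in convertD2F_Jamps later in this patch.

#include <cstddef>
// Illustrative flat index into the jamp super-buffer: [ix2][icol][ighel][ievt]
constexpr std::size_t
toyJampIndex( int ix2, int icol, int ighel, int ievt, int ncolor, int nhel, int nevt )
{
  return static_cast<std::size_t>( ix2 ) * ncolor * nhel * nevt // slowest: real (0) or imaginary (1) part
         + static_cast<std::size_t>( icol ) * nhel * nevt       // then the color index
         + static_cast<std::size_t>( ighel ) * nevt             // then the good-helicity index
         + static_cast<std::size_t>( ievt );                    // fastest: the event index (coalesced device access)
}
// Example: ncolor=3, nhel=2, nevt=8 -> imaginary part of color 1, helicity 0, event 5
static_assert( toyJampIndex( 1, 1, 0, 5, 3, 2, 8 ) == 69, "jamp super-buffer layout sanity check" );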
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -377,155 +433,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_bbx()?) 
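As an aside on the "Store the leading color flows for choice of color" step just above: in the new GPU flow one calculate_jamps kernel is launched per good helicity on its own stream, so several kernels may update the same colAllJamp2s slot for a given event concurrently; this is why the device path accumulates |jamp(icol)|^2 with atomicAdd, while the C++ path can keep a plain +=. Below is a minimal CUDA sketch of that accumulation pattern (toyAccumulateJamp2 and its buffers are illustrative names, not the generated code).

// Toy accumulation of |jamp|^2 into a shared [ncolor][nevt] buffer, one kernel
// launch per helicity on its own stream (double atomicAdd needs sm_60 or later).
__global__ void
toyAccumulateJamp2( double* jamp2,        // in/out: [ncolor * nevt], running sum over colors and helicities
                    const double* jampRe, // input: [ncolor * nevt], real parts for one helicity
                    const double* jampIm, // input: [ncolor * nevt], imaginary parts for one helicity
                    const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double re = jampRe[icol * nevt + ievt];
    const double im = jampIm[icol * nevt + ievt];
    atomicAdd( &jamp2[icol * nevt + ievt], re * re + im * im ); // safe across concurrent helicity streams
  }
}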
- - // The color denominators (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 1 }; // 1-D array[3] - - // The color matrix (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2, 6 }, - { -2, 16, 6 }, - { 2, 2, 6 } }; // 2-D array[3][3] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
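For clarity, a small standalone sketch of what the color-matrix block removed here computes for a single event (and what the new color_sum.cc below reimplements): the |M|^2 contribution of one helicity is the quadratic form sum over i,j of Re(J_i) (cf_ij/denom_i) Re(J_j) plus the same term for Im(J), where cf_ij/denom_i is symmetric even though cf itself is not. The function name toyColorSum is illustrative; the ncolor=3 constants are those of this gg -> bbx process as given in the diff.

// Illustrative single-event color sum (double precision, full symmetric form,
// i.e. the "old" CUDA-style implementation): returns the |M|^2 contribution of
// one helicity from its ncolor=3 partial amplitudes.
double
toyColorSum( const double jampRe[3], const double jampIm[3] )
{
  constexpr int ncolor = 3;
  constexpr double denom[ncolor] = { 3, 3, 1 };
  constexpr double cf[ncolor][ncolor] = { { 16, -2, 6 }, { -2, 16, 6 }, { 2, 2, 6 } };
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0;
    double ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jampRe[jcol]; // Re(J)^T cf
      ztempI += cf[icol][jcol] * jampIm[jcol]; // Im(J)^T cf
    }
    deltaME += ( ztempR * jampRe[icol] + ztempI * jampIm[icol] ) / denom[icol];
  }
  return deltaME; // the real code ADDs this to the running sum of |M|^2 over helicities
}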
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -565,7 +509,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -598,6 +546,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MB ); m_masses.push_back( m_pars->mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MB, (fptype)m_pars->mdl_MH, (fptype)m_pars->mdl_WH }; @@ -639,6 +591,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_heft::ZERO ); m_masses.push_back( Parameters_heft::mdl_MB ); m_masses.push_back( Parameters_heft::mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -759,8 +715,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -768,25 +724,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // 
Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -931,13 +1065,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -949,18 +1077,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -985,93 +1118,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1113,7 +1183,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1136,7 +1206,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1145,21 +1215,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1173,8 +1245,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1190,11 +1264,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1296,14 +1371,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h index 30c5663297..cacb35c052 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_heft.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 4; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 3; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc new file mode 100644 index 0000000000..94b1137d64 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc @@ -0,0 +1,428 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
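A small compile-time illustration (not part of the generated file) of the "normalized color matrix" built in this file: each row icol of colorMatrix is divided by colorDenom[icol]. With the ncolor=3 constants defined below (colorDenom = {3,3,1}, colorMatrix = {{16,-2,6},{-2,16,6},{2,2,6}}), the normalized matrix is {{16/3,-2/3,2},{-2/3,16/3,2},{2,2,6}}, which is symmetric even though colorMatrix itself is not; this symmetry is what the triangular C++ path and the BLAS path exploit. The toy* names are illustrative local copies.

// Illustrative local copies of the constants defined further down in this file
constexpr double toyDenom[3] = { 3, 3, 1 };
constexpr double toyMatrix[3][3] = { { 16, -2, 6 }, { -2, 16, 6 }, { 2, 2, 6 } };
constexpr double toyNorm( int i, int j ) { return toyMatrix[i][j] / toyDenom[i]; }
// The normalization makes the matrix symmetric: e.g. 6/3 == 2/1 == 2
static_assert( toyNorm( 0, 2 ) == 2 && toyNorm( 2, 0 ) == 2, "off-diagonal symmetry (0,2)" );
static_assert( toyNorm( 1, 2 ) == 2 && toyNorm( 2, 1 ) == 2, "off-diagonal symmetry (1,2)" );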
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 1 }; // 1-D array[3] + + // The color matrix (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2, 6 }, + { -2, 16, 6 }, + { 2, 2, 6 } }; // 2-D array[3][3] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt)
+                                              nevtN ) );                       // there are nevtN (nhel*nevt) "batches"
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Convert MEs from float to double
+    for( int ighel = 0; ighel < nhel; ighel++ )
+    {
+      fptype* hAllMEs = ghelAllMEs + ighel * nevt;          // MEs for a single helicity ihel
+      fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel
+      gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 );
+    }
+#endif
+  }
+#endif /* clang-format on */
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+    // CASE 1: KERNEL
+    if( !pBlasHandle )
+    {
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      // Loop over helicities
+      for( int ighel = 0; ighel < nGoodHel; ighel++ )
+      {
+        fptype* hAllMEs = ghelAllMEs + ighel * nevt;           // MEs for one specific helicity ighel
+        const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel
+        gpuStream_t hStream = ghelStreams[ighel];
+        gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel );
+      }
+    }
+    // CASE 2: BLAS
+    else
+    {
+#ifdef MGONGPU_HAS_NO_BLAS
+      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
+#else
+      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
+      // Reset the tmp buffer
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
+#else
+      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
+#endif
+      // Delegate the color sum for all good helicities to BLAS
+      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
+#endif
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+} // end namespace
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/cudacpp_overlay.mk
new file mode 120000
index 0000000000..181212c4c6
--- /dev/null
+++ 
b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fbridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/makefile_original.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + 
ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
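A minimal standalone sketch (with hypothetical helper names, not part of the patch itself) of the "all helicities" jamp layout used by DeviceAccessJamp in color_sum.h above: the real parts of all jamps form one ncolor x (nhel*nevt) block and the imaginary parts a second identical block, the same real/imaginary split that color_sum_blas feeds to the gemm calls.

#include <cstdio>

// flat offsets into a jamp buffer of size 2 * ncolor * nhel * nevt (block 0 = real, block 1 = imaginary)
constexpr int jampRealIndex( int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
{
  return 0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt;
}

constexpr int jampImagIndex( int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
{
  return 1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt;
}

int main()
{
  constexpr int ncolor = 3, nhel = 4, nevt = 8; // toy sizes (ncolor=3 as in this process)
  static_assert( jampRealIndex( 0, 0, 0, ncolor, nhel, nevt ) == 0, "real block starts at offset 0" );
  static_assert( jampImagIndex( 0, 0, 0, ncolor, nhel, nevt ) == ncolor * nhel * nevt, "imaginary block follows the real block" );
  printf( "jamp(icol=2,ihel=1,ievt=5): real part at %d, imaginary part at %d\n",
          jampRealIndex( 2, 1, 5, ncolor, nhel, nevt ),
          jampImagIndex( 2, 1, 5, ncolor, nhel, nevt ) );
  return 0;
}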
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
 
 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
 
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------
 
 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
 
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #  $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h index 1b04401547..534bb65c13 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc index 0fa5a34cf0..3b4c719337 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h
index 0faa7bb71e..7ab2db5300 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h
index d3c4ca5695..7d34de72f8 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h
@@ -74,6 +74,7 @@
 #define MGONGPU_FPTYPE2_DOUBLE 1 // default
 //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
 #endif
+
 // Choose whether to inline all HelAmps functions
 // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
 // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif
 
+// Choose if cuBLAS and hipBLAS are supported (they are used for the BLAS color sum)
+// For both CUDA and HIP, by default, assume that BLAS is available (i.e. do not define MGONGPU_HAS_NO_BLAS), but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#elif defined __HIPCC__
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#else
+#define MGONGPU_HAS_NO_BLAS 1
+#endif
+
 // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #undef MGONGPU_NSIGHT_DEBUG // default in CUDA
-//#define MGONGPU_NSIGHT_DEBUG 1
+//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED!
 #else
 #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
 #endif /* clang-format on */
@@ -232,19 +246,19 @@ using mgOnGpu::fptype2;
 #endif /* clang-format off */
 
-// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
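A minimal sketch (an assumed helper, not part of mgOnGpuConfig.h) of how downstream code can branch on the MGONGPU_HAS_NO_BLAS default chosen above: CUDA and HIP builds leave the macro undefined unless -DMGONGPU_HAS_NO_BLAS (or HASBLAS=hasNoBlas) is passed, while C++-only builds always define it.

#include "mgOnGpuConfig.h" // assumes the plugin src/ directory is on the include path
#include <cstdio>

// true if this build can use cuBLAS/hipBLAS for the color sum, false otherwise
constexpr bool buildHasBlas()
{
#ifdef MGONGPU_HAS_NO_BLAS
  return false; // C++-only builds, or builds configured with HASBLAS=hasNoBlas / -DMGONGPU_HAS_NO_BLAS
#else
  return true; // CUDA/HIP builds where the cuBLAS/hipBLAS headers were found
#endif
}

int main()
{
  printf( "BLAS color sum available in this build: %s\n", buildHasBlas() ? "yes" : "no" );
  return 0;
}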
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index 11380fe474..1526092aa7 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +56,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006017446517944336  +DEBUG: model prefixing takes 0.006911277770996094  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -181,7 +180,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.107 s +4 processes with 8 diagrams generated in 0.123 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. 
@@ -223,21 +222,21 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.640 s +12 processes with 144 diagrams generated in 0.841 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ w+ d WEIGHTED<=5 @1 INFO: Processing color information for process: g u > t t~ w+ d @1 @@ -271,9 +270,9 @@ FileWriter t t~ w+ d WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxwpd -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gd_ttxwmu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -282,9 +281,9 @@ FileWriter t t~ w- u WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gd_ttxwmu -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gux_ttxwmdx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -293,9 +292,9 @@ FileWriter t t~ w- d~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxwmdx -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gdx_ttxwpux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -304,9 +303,9 @@ FileWriter t t~ w+ u~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gdx_ttxwpux -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  INFO: Creating files in directory P1_udx_ttxwpg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -315,9 +314,9 @@ FileWriter t t~ w+ g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group udx_ttxwpg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  INFO: Creating files in directory P1_dux_ttxwmg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -326,9 +325,9 @@ FileWriter t t~ w- g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group dux_ttxwmg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  INFO: Creating files in directory P0_udx_ttxwp DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -337,9 +336,9 @@ FileWriter t t~ w+ WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group udx_ttxwp -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  INFO: Creating files in directory P0_dux_ttxwm DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -348,21 +347,21 @@ FileWriter t t~ w- WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group dux_ttxwm -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1552]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.202 s -Wrote files for 212 helas calls in 0.830 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  +Generated helas calls for 8 subprocesses (76 diagrams) in 0.220 s +Wrote files for 212 helas calls in 1.061 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 3 routines in 0.204 s +ALOHA: aloha creates 3 routines in 0.221 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.200 s +ALOHA: aloha creates 6 routines in 0.185 s FFV1 FFV1 FFV1 @@ -370,74 +369,32 @@ ALOHA: aloha creates 6 routines in 0.200 s FFV2 FFV2 VVV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h -INFO: Created file HelAmps_sm_no_b_mass.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h +INFO: Created file HelAmps_sm_no_b_mass.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc INFO: Created files Parameters_sm_no_b_mass.h and Parameters_sm_no_b_mass.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P0_dux_ttxwm; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 268 (offset 41 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P0_udx_ttxwp; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 268 (offset 41 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_dux_ttxwmg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gd_ttxwmu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gdx_ttxwpux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gu_ttxwpd; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gux_ttxwmdx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 316 (offset 89 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_udx_ttxwpg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 72 (offset 1 line). -Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m4.658s -user 0m4.105s -sys 0m0.537s -Code generation completed in 5 seconds +real 0m6.431s +user 0m5.417s +sys 0m0.963s +Code generation completed in 6 seconds ************************************************************ * * * W E L C O M E to * @@ -450,7 +407,7 @@ Code generation completed in 5 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -458,10 +415,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -480,7 +436,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -488,10 +444,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/COPYRIGHT b/epochX/cudacpp/nobm_pp_ttW.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/COPYRIGHT +++ b/epochX/cudacpp/nobm_pp_ttW.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat index 72b31976a0..961c6b1d6e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat index 5eca3e3f2b..48beb899d9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat @@ -127,6 +127,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat index 3b445d02a0..c22a9e0249 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat @@ -127,6 +127,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt b/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts b/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f b/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc b/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile b/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc b/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc index 2588190439..e169c1f193 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
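Note on the precision-mixing rule stated in the comment above (and continued just below): when the Fortran floating-point type and the cudacpp fptype coincide, the buffers can be copied byte for byte, otherwise an element-wise converting copy is needed, which is exactly the std::is_same_v dispatch that the cpu_sequence/gpu_sequence implementations perform later in this file. A minimal standalone sketch of that dispatch, using illustrative type names (FortranFp, MeFp) rather than the plugin's real FORTRANFPTYPE/fptype typedefs:

// Standalone sketch (not the plugin's code): copy a Fortran-side buffer into a
// matrix-element-side buffer, converting only when the two precisions differ.
#include <algorithm>
#include <cstring>
#include <iostream>
#include <type_traits>
#include <vector>

template<typename FortranFp, typename MeFp>
void copyFortranToMe( const FortranFp* src, MeFp* dst, std::size_t n )
{
  if constexpr( std::is_same_v<FortranFp, MeFp> )
    std::memcpy( dst, src, n * sizeof( MeFp ) ); // same type: a raw byte copy is enough
  else
    std::copy( src, src + n, dst ); // mixed precision: element-wise conversion
}

int main()
{
  const std::vector<double> fortranGs{ 1.2, 1.3, 1.4, 1.5 }; // e.g. double Gs from MadEvent
  std::vector<float> meGs( fortranGs.size() );               // e.g. a float sigmaKin build
  copyFortranToMe( fortranGs.data(), meGs.data(), meGs.size() );
  for( float g : meGs ) std::cout << g << " ";
  std::cout << std::endl;
  return 0;
}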
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
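The parameter lists documented above fix a simple per-event shape for every array crossing the Fortran/C++ boundary: the momenta carry np4 components for each of npar external particles per event, while all other inputs and outputs carry one entry per event. A standalone sketch of how a hypothetical caller might size those host buffers (nevt = 32 matches the --vector_size=32 used in the generation log earlier; npar and np4 are illustrative, not read from the generated process):

// Standalone sizing sketch for the Bridge sequence arguments (hypothetical caller).
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  const std::size_t nevt = 32; // VECSIZE_USED in the Fortran driver (vector_size=32 above)
  const std::size_t npar = 6;  // NEXTERNAL, e.g. 6 externals for a 2->4 subprocess (illustrative)
  const std::size_t np4 = 4;   // E, px, py, pz

  std::vector<double> momenta( nevt * npar * np4 ); // P_MULTI(0:3,NEXTERNAL,VECSIZE_USED)
  std::vector<double> gs( nevt );                   // running alphas coupling per event
  std::vector<double> rndhel( nevt );               // random numbers for helicity selection
  std::vector<double> rndcol( nevt );               // random numbers for colour selection
  std::vector<unsigned int> channelIds( nevt );     // diagram to enhance in multi-channel mode
  std::vector<double> mes( nevt );                  // output matrix elements
  std::vector<int> selhel( nevt );                  // output selected helicities
  std::vector<int> selcol( nevt );                  // output selected colours
  // A real caller would then hand these pointers to the Bridge, e.g.
  // bridge.cpu_sequence( momenta.data(), gs.data(), rndhel.data(), rndcol.data(),
  //                      channelIds.data(), mes.data(), selhel.data(), selcol.data() );
  std::cout << "momenta doubles: " << momenta.size() << std::endl;
  return 0;
}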
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
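Before the constructor body continues below, it is worth spelling out the two momenta layouts the Bridge mediates between: the Fortran AOS array indexed as [ievt][ipar][ip4] and the cudacpp AOSOA array indexed as [ipagM][ipar][ip4][ieppM] with ievt = ipagM*neppM + ieppM. A standalone host-only sketch of that index arithmetic, using the same formulas as the transposition helpers further down in this file; nevt, npar, np4 and neppM are illustrative values only:

// Standalone sketch of the AOS <-> AOSOA index mapping used for the momenta.
// F-style (AOS):   in[ievt][ipar][ip4]
// C-style (AOSOA): out[ipagM][ipar][ip4][ieppM], with ievt = ipagM*neppM + ieppM
#include <cassert>
#include <iostream>
#include <vector>

int main()
{
  const int nevt = 8, npar = 5, np4 = 4, neppM = 4; // illustrative values only
  assert( nevt % neppM == 0 ); // number of events must be a multiple of neppM
  std::vector<double> aos( nevt * npar * np4 ), aosoa( nevt * npar * np4 );
  for( std::size_t i = 0; i < aos.size(); ++i ) aos[i] = double( i ); // dummy momenta
  for( int ievt = 0; ievt < nevt; ++ievt )
    for( int ipar = 0; ipar < npar; ++ipar )
      for( int ip4 = 0; ip4 < np4; ++ip4 )
      {
        const int ipagM = ievt / neppM, ieppM = ievt % neppM;
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4; // AOS position
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // AOSOA position
        aosoa[cpos] = aos[fpos]; // F2C direction; swapping sides gives C2F
      }
  std::cout << "first SIMD page, particle 0, component 0: ";
  for( int ieppM = 0; ieppM < neppM; ++ieppM ) std::cout << aosoa[ieppM] << " ";
  std::cout << std::endl;
  return 0;
}

The printed values show that, after the F2C copy, the same momentum component of neppM consecutive events sits contiguously in memory, which is the layout the SIMD and GPU kernels rely on.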
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
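The assertGpuBlas helper introduced in the GpuRuntime.h hunk above mirrors the existing checkGpu pattern: compare a status code against a single success value and report file and line on failure. The GpuAbstraction.h hunks pair it with single-letter precision dispatch (gpuBlasSgemm or gpuBlasDgemm selected behind gpuBlasTgemm via MGONGPU_FPTYPE2_FLOAT). A standalone CPU-only sketch of both patterns, with a toy status enum and a toy axpy standing in for the real cuBLAS/hipBLAS entry points (none of the Toy* names exist in the plugin):

// Standalone sketch (no GPU, no BLAS): status-check macro plus precision dispatch.
#include <cassert>
#include <cstdio>
#include <vector>

enum ToyBlasStatus { TOYBLAS_STATUS_SUCCESS = 0, TOYBLAS_STATUS_BAD_SIZE = 7 }; // toy stand-in

#define checkToyBlas( code ) { assertToyBlas( code, __FILE__, __LINE__ ); }
inline void assertToyBlas( ToyBlasStatus code, const char* file, int line, bool abort = true )
{
  if( code != TOYBLAS_STATUS_SUCCESS )
  {
    printf( "ERROR! assertToyBlas: '%d' in %s:%d\n", code, file, line );
    if( abort ) assert( code == TOYBLAS_STATUS_SUCCESS );
  }
}

// Toy single- and double-precision axpy (y += a*x), mimicking the S/D naming scheme
ToyBlasStatus toyBlasSaxpy( int n, float a, const float* x, float* y )
{
  if( n < 0 ) return TOYBLAS_STATUS_BAD_SIZE;
  for( int i = 0; i < n; ++i ) y[i] += a * x[i];
  return TOYBLAS_STATUS_SUCCESS;
}
ToyBlasStatus toyBlasDaxpy( int n, double a, const double* x, double* y )
{
  if( n < 0 ) return TOYBLAS_STATUS_BAD_SIZE;
  for( int i = 0; i < n; ++i ) y[i] += a * x[i];
  return TOYBLAS_STATUS_SUCCESS;
}

// Precision dispatch in the style of gpuBlasTaxpy: pick S or D from a compile-time switch
#ifdef TOY_FPTYPE2_FLOAT
#define toyBlasTaxpy toyBlasSaxpy
typedef float toyFp;
#else
#define toyBlasTaxpy toyBlasDaxpy
typedef double toyFp;
#endif

int main()
{
  std::vector<toyFp> x( 4, 1 ), y( 4, 2 );
  checkToyBlas( toyBlasTaxpy( (int)x.size(), 3, x.data(), y.data() ) ); // y becomes 5,5,5,5
  printf( "y[0] = %f\n", (double)y[0] );
  return 0;
}

Compiling the sketch with -DTOY_FPTYPE2_FLOAT switches the same call site to the single-precision routine, which is the effect the gpuBlasT* macros achieve for the colour-sum GEMMs.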
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h index 1e7cc050f7..71a4c3f155 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_no_b_mass_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif 
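The new MatrixElementKernelDevice data members above follow a "reserve ncomb, use nGoodHel" pattern: the stream array is statically sized to all helicity combinations, while the per-helicity device super-buffers are held through std::unique_ptr and only (re)allocated once the number of good helicities is known at runtime. A self-contained sketch of that pattern follows; the buffer class, NCOMB constant and buffer sizes are stand-ins, not the patch's actual types:

// Sketch: allocate per-helicity GPU resources only after runtime helicity filtering.
#include "GpuAbstraction.h" // assumed available: gpuStream_t, gpuStreamCreate, gpuStreamDestroy
#include "GpuRuntime.h"     // assumed available: checkGpu used inside the gpuStream* macros
#include <memory>
struct DeviceBufferSketch { explicit DeviceBufferSketch( size_t nelem ) { /* device allocation elided */ } };
constexpr int NCOMB = 16; // stand-in for CPPProcess::ncomb
class HelicityResourcesSketch
{
public:
  void onGoodHelicitiesKnown( int nGoodHel, int nevt )
  {
    // Resetting the unique_ptr releases the smaller "one-helicity" buffer used during
    // filtering and replaces it by the "many-helicity" super-buffer sized by nGoodHel
    m_pHelMEs.reset( new DeviceBufferSketch( (size_t)nGoodHel * nevt ) );
    for( int ighel = 0; ighel < nGoodHel; ighel++ )
      gpuStreamCreate( &m_helStreams[ighel] ); // only the first nGoodHel streams are created
  }
  ~HelicityResourcesSketch()
  {
    for( int ihel = 0; ihel < NCOMB; ihel++ )
      if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // never-created entries stay nullptr
  }
private:
  std::unique_ptr<DeviceBufferSketch> m_pHelMEs; // runtime-sized device super-buffer
  gpuStream_t m_helStreams[NCOMB] = {};          // value-initialised, as in the patch's m_helStreams()
};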
@@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + 
typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc index 97050f0aa2..f17f7676e3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
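The DeviceAccessJamp2 accessor introduced earlier in this file addresses the jamp2 buffer as buffer[icol * nevt + ievt], i.e. a structure-of-arrays layout in which the event index varies fastest. A short illustrative kernel (not part of the patch) showing why this indexing gives coalesced global-memory accesses:

// Sketch: for a fixed color index icol, consecutive threads (consecutive ievt) read
// consecutive addresses, so each warp issues coalesced loads from the jamp2-style buffer.
__global__ void sumOverColorsSketch( const double* buffer, double* out, const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;                // same convention as DeviceAccessJamp2
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  double sum = 0;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += buffer[icol * nevt + ievt]; // same index arithmetic as kernelAccessIcolConst
  out[ievt] = sum;
}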
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -360,154 +416,43 @@ namespace mg5amcCpu jamp_sv[1] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_dux_ttxwm()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 9, 3 }, - { 3, 9 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
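The atomicAdd in the jamp2 accumulation above is needed because, after this patch, one calculate_jamps kernel is launched per good helicity on its own stream, and those kernels may run concurrently while all adding into the same per-event jamp2 accumulator. A small illustrative kernel (not part of the patch) making the race explicit:

// Sketch: kernels launched on different streams (one per good helicity) all add their
// |jamp|^2 contribution into the same accumulator element for a given (icol, ievt);
// since those kernels can overlap in time, the read-modify-write must be atomic
// (a plain "+=" would be a data race across streams).
__global__ void accumulateJamp2Sketch( double* colAllJamp2s, const double* myAbs2, const int icol )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  atomicAdd( &colAllJamp2s[icol * nevt + ievt], myAbs2[ievt] );
}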
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -579,7 +524,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -613,6 +562,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -655,6 +608,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -775,8 +732,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -784,25 +741,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -947,13 +1082,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -965,18 +1094,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1001,93 +1135,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1129,7 +1200,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1152,7 +1223,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1161,21 +1232,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1189,8 +1262,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1206,11 +1281,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1312,14 +1388,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h index 9d6c262053..a1daef0aaa 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 48; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f index 7f7324dc0b..a5edcacd08 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f index 08dd1f728a..b967ccba14 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) S1=PDG2PDF(LPP(IB(1)),3, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) UX2=PDG2PDF(LPP(IB(2)),-2, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -459,51 +463,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc new file mode 100644 index 0000000000..04c22fd369 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 9, 3 }, + { 3, 9 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] =
+          allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* ghelAllMEs,           // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity
+                  const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities
+                  fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nhel good helicities
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null)
+#else
+                  gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null)
+#endif
+                  const int nhel,               // input: number of good helicities (nhel == nGoodHel)
+                  const int gpublocks,          // input: cuda gpublocks
+                  const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffer for the nhel helicities
+    fptype2* ghelAllZtempBoth = ghelAllBlasTmp;                                         // start of first fptype2[ncolor*2*nhel*nevt] buffer
+    fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt;   // start of second fptype2[ncolor*2*nhel*nevt] buffer
+    fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer
+    // Convert jamps from double to float
+    for( int ighel = 0; ighel < nhel; ighel++ )
+    {
+      const fptype* hAllJamps = ghelAllJamps + ighel * nevt;    // jamps for a single helicity ihel
+      fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel
+      gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel );
+    }
+    // Real and imaginary components
+    const fptype2* ghelAllJampsReal = ghelAllJampsFpt2;
+    const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt;
+#else
+    // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer
+    static_assert( std::is_same<fptype, fptype2>::value );
+    fptype2* ghelAllZtempBoth = ghelAllBlasTmp; //
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/configs.inc index a4ca4e23a5..3d0bd5df67 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/configs.inc @@ -24,3 +24,5 @@ C Diagram 2 DATA (SPROP(I,-3,2),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/2/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f index 531dfa0771..51ded2dd76 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f index 5c47e1c729..f350dd008d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -268,17 +265,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -355,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -399,7 +385,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(1) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -444,23 +431,31 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /9.000000000000000D+00 - $ ,3.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 2) /9,6/ C 1 T(2,1) T(3,4) - DATA (CF(I, 2),I= 1, 2) /3.000000000000000D+00 - $ ,9.000000000000000D+00/ + DATA (CF(I),I= 3, 3) /9/ C 1 T(2,4) T(3,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
@@ -497,10 +492,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -509,6 +506,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc index 57246ba1e7..80467846e0 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
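The new calculate_jamps interface above writes each helicity's partial amplitudes into a per-helicity slot of a ghelAllJamps super-buffer. Below is a minimal standalone sketch of the flattened [real/imag][icol][ihel][ievt] striding described by the comments and accessors elsewhere in this diff (DeviceAccessJamp is assumed to encode the same layout); jampSuperIndex is a hypothetical helper for illustration only, not part of the plugin.

#include <cstddef>

// Hypothetical helper (illustration only): flattened offset of one (re/im, icol, ihel, ievt)
// element in the ghelAllJamps super-buffer, matching the striding used by convertD2F_Jamps
// and the cuBLAS notes further down: reim*ncolor*nhel*nevt + icol*nhel*nevt + ihel*nevt + ievt.
inline std::size_t jampSuperIndex( int reim, int icol, int ihel, int ievt,
                                   int ncolor, int nhel, int nevt )
{
  return ( ( (std::size_t)reim * ncolor + icol ) * nhel + ihel ) * nevt + ievt;
}
// With this layout, passing "ghelAllJamps + ighel * nevt" to calculate_jamps together with
// ihel0 = 0 addresses the same elements as indexing the full buffer with ihel = ighel, which
// is why the per-helicity pointer offset used in sigmaKin is consistent with DeviceAccessJamp.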
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -360,154 +416,43 @@ namespace mg5amcCpu jamp_sv[1] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_udx_ttxwp()?) 
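// Illustration (not plugin code, and deliberately simplified): the per-color cxabs2 running
// sums stored just above are what the event-by-event color choice eventually consumes. A
// standalone sketch of that choice - build the cumulative sum of the color weights and pick
// the first color whose cumulative fraction exceeds a random number in [0,1) - is given here;
// the real select_col kernel further down additionally applies the icolamp/iconfig filtering
// and the channelId sanity checks, which are omitted in this toy example.
#include <cstdio>
int main()
{
  constexpr int ncolor = 2;
  const double jamp2[ncolor] = { 0.75, 0.25 }; // made-up |jamp|^2 sums over good helicities
  const double rndcol = 0.9;                   // made-up random number in [0,1)
  double targetamp[ncolor];
  targetamp[0] = jamp2[0];
  for( int icol = 1; icol < ncolor; icol++ ) targetamp[icol] = targetamp[icol - 1] + jamp2[icol];
  int selcol = 0; // 0 means "no color selected"
  for( int icol = 0; icol < ncolor; icol++ )
    if( rndcol < targetamp[icol] / targetamp[ncolor - 1] )
    {
      selcol = icol + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
      break;
    }
  std::printf( "selected color = %d\n", selcol );
  return 0;
}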
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 9, 3 }, - { 3, 9 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
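// Worked standalone example (assumed values from this process: ncolor=2, cf={{9,3},{3,9}},
// denom={1,1}; the jamp values are made up): with a real symmetric color matrix the quadratic
// form conj(J)^T (cf/denom) J has no imaginary part, so |M|^2 can be computed either from the
// full matrix or from the triangular rewrite of #475 (diagonal terms once, off-diagonal terms
// folded with a factor 2); the two evaluations below give the same result.
#include <complex>
#include <cstdio>
int main()
{
  constexpr int ncolor = 2;
  const std::complex<double> jamp[ncolor] = { { 0.3, -0.1 }, { -0.2, 0.4 } };
  const double cf[ncolor][ncolor] = { { 9, 3 }, { 3, 9 } };
  const double denom[ncolor] = { 1, 1 };
  double me2full = 0; // full quadratic form: sum_ij ( ReJi*ReJj + ImJi*ImJj ) * cf[i][j] / denom[i]
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2full += ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() ) * cf[i][j] / denom[i];
  double me2tri = 0; // triangular rewrite: diagonal terms once, upper off-diagonal terms doubled
  for( int i = 0; i < ncolor; i++ )
  {
    me2tri += ( jamp[i].real() * jamp[i].real() + jamp[i].imag() * jamp[i].imag() ) * cf[i][i] / denom[i];
    for( int j = i + 1; j < ncolor; j++ )
      me2tri += 2 * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() ) * cf[i][j] / denom[i];
  }
  std::printf( "full=%.12f triangular=%.12f\n", me2full, me2tri );
  return 0;
}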
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -579,7 +524,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -613,6 +562,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -655,6 +608,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -775,8 +732,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -784,25 +741,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -947,13 +1082,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -965,18 +1094,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1001,93 +1135,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1129,7 +1200,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1152,7 +1223,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1161,21 +1232,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1189,8 +1262,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1206,11 +1281,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1312,14 +1388,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h index cd8edd3e39..a193c09aed 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 48; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f index 2e439af0a3..817d8f646f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f index 0808ce67ce..01d47ba27c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) C1=PDG2PDF(LPP(IB(1)),4, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) DX2=PDG2PDF(LPP(IB(2)),-1, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -459,51 +463,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc new file mode 100644 index 0000000000..04c22fd369 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 9, 3 }, + { 3, 9 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
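    // What follows implements the color sum in two flavours of the same loop:
    // - in mixed precision (MGONGPU_FPTYPE_DOUBLE with MGONGPU_FPTYPE2_FLOAT), the two neppV
    //   event pages received in allJamp_sv are first merged into float vectors twice the SIMD
    //   width (fpvmerge), the color sum is evaluated once on the wider vectors, and the two
    //   halves of each result are then split back (fpvsplit0/fpvsplit1) into deltaMEs for the
    //   first page and deltaMEs_next for the second page;
    // - in plain float or double builds, a single neppV page is processed and only deltaMEs is used.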
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffer for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same<fptype, fptype2>::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; //
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/configs.inc index fd7f72bff4..2a57ec47a3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/configs.inc @@ -24,3 +24,5 @@ C Diagram 2 DATA (SPROP(I,-3,2),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/2/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f index 531dfa0771..51ded2dd76 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f index bbf708250a..357edfe899 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -268,17 +265,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -355,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -399,7 +385,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(1) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -444,23 +431,31 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /9.000000000000000D+00 - $ ,3.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 2) /9,6/ C 1 T(2,1) T(3,4) - DATA (CF(I, 2),I= 1, 2) /3.000000000000000D+00 - $ ,9.000000000000000D+00/ + DATA (CF(I),I= 3, 3) /9/ C 1 T(2,4) T(3,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
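Illustrative aside (not part of the generated patch): the matrix1.f hunk above replaces the full NCOLOR x NCOLOR real color matrix CF by a flat INTEGER array holding only its upper triangle, with off-diagonal entries pre-doubled (9,6,9 here instead of the 2x2 matrix 9,3 / 3,9) and a common DENOM applied once at the end; the CF_INDEX loop in the next hunk then walks this packed array row by row. A minimal standalone C++ sketch of the same packed accumulation, with hypothetical names (packedColorSum, cf, jamp):

    #include <complex>
    // |M|^2 = Re( sum_i conj(jamp[i]) * sum_{j>=i} cf[idx] * jamp[j] ) / denom,
    // where cf holds the upper triangle row by row and off-diagonal entries are already doubled
    double packedColorSum( const int* cf, int denom, const std::complex<double>* jamp, int ncolor )
    {
      double me2 = 0;
      int idx = 0; // running index into the packed triangular color matrix (cf. CF_INDEX)
      for( int i = 0; i < ncolor; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = i; j < ncolor; j++ ) ztemp += double( cf[idx++] ) * jamp[j];
        me2 += std::real( ztemp * std::conj( jamp[i] ) ); // real part of ZTEMP*DCONJG(JAMP(I))
      }
      return me2 / denom;
    }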
@@ -497,10 +492,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -509,6 +506,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc index 3261780672..e3a7b6109e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
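Illustrative aside (not part of the generated patch): the new DeviceAccessJamp2 accessor above and the ghelAllJamps super-buffer documented earlier in this patch both use an event-major structure-of-arrays layout, so that the event index (one GPU thread per event) is the fastest-running dimension. A small standalone C++ sketch of the two flattenings, with hypothetical helper names (jamp2Index, jampSuperIndex):

    // jamp2 buffer        allJamp2s[icol][ievt]            ->  icol * nevt + ievt          (cf. DeviceAccessJamp2)
    // jamp super-buffer   allJamps[ix2][icol][ihel][ievt]  ->  ((ix2 * ncolor + icol) * nhel + ihel) * nevt + ievt
    inline int jamp2Index( int icol, int ievt, int nevt )
    {
      return icol * nevt + ievt;
    }
    inline int jampSuperIndex( int ix2, int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
    {
      // identical to the striding ix2*ncolor*nhel*nevt + icol*nhel*nevt + ihel*nevt + ievt used for cuBLAS
      return ( ( ix2 * ncolor + icol ) * nhel + ihel ) * nevt + ievt;
    }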
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -506,156 +562,43 @@ namespace mg5amcCpu jamp_sv[3] += 1. / 6. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_dux_ttxwmg()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
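Illustrative aside (not part of the generated patch): the removed comment above (whose logic survives in the new color_sum code earlier in this patch) uses the fact that for a real symmetric matrix M and a complex vector J = A + iB the quadratic form conj(J)^T M J is real and equals A^T M A + B^T M B, so the color sum can be evaluated on the real and imaginary parts separately. A minimal numeric check of that identity (hypothetical standalone test):

    #include <cassert>
    #include <cmath>
    #include <complex>
    int main()
    {
      const double M[2][2] = { { 12, 4 }, { 4, 12 } };                    // a small real symmetric matrix
      const std::complex<double> J[2] = { { 1.5, -0.5 }, { 0.25, 2.0 } }; // an arbitrary complex test vector
      std::complex<double> quad = 0; // conj(J)^T M J
      double realForm = 0;           // A^T M A + B^T M B
      for( int i = 0; i < 2; i++ )
        for( int j = 0; j < 2; j++ )
        {
          quad += std::conj( J[i] ) * M[i][j] * J[j];
          realForm += J[i].real() * M[i][j] * J[j].real() + J[i].imag() * M[i][j] * J[j].imag();
        }
      assert( std::abs( quad.imag() ) < 1e-12 );            // the imaginary part cancels for symmetric M
      assert( std::abs( quad.real() - realForm ) < 1e-12 ); // Re(conj(J)^T M J) == A^T M A + B^T M B
      return 0;
    }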
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -775,7 +718,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +757,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +804,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -973,8 +928,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +937,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1145,13 +1278,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,18 +1290,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1199,93 +1331,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
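Illustrative aside (not part of the generated patch): the SCALAR-channelId comment above, the code removed just below, and the C++ path kept further down in this hunk all rely on the same contract (#898): all events within one SIMD event page carry the same channelId, so a single scalar channelId can be used for the whole page. A minimal sketch of that extraction and check, with a hypothetical helper name (uniformChannelId):

    #include <cassert>
    // Return the single channelId shared by a SIMD page of neppV events (contract #898)
    unsigned int uniformChannelId( const unsigned int* pageChannelIds, int neppV )
    {
      const unsigned int channelId = pageChannelIds[0];
      for( int i = 1; i < neppV; i++ ) assert( pageChannelIds[i] == channelId ); // all equal within the page
      assert( channelId > 0 ); // a scalar channelId must be > 0 when multichannel is enabled
      return channelId;
    }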
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1396,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1419,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,21 +1428,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1387,8 +1458,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1477,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1584,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h index ecb184f729..582051038f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f index 26d6979a1d..7b6075d0bd 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f index 330b566ed8..80270f0371 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) S1=PDG2PDF(LPP(IB(1)),3, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) UX2=PDG2PDF(LPP(IB(2)),-2, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -459,51 +463,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
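// [Editor's note - illustrative sketch only, not part of the generated file] color_sum.cc factors out the color
// sum that used to be inlined in calculate_wavefunctions: for each event and each good helicity it adds
//   deltaME = sum_{i,j} ( colorMatrix[i][j] / colorDenom[i] ) * ( Re(jamp[i])*Re(jamp[j]) + Im(jamp[i])*Im(jamp[j]) )
// to the running |M|^2, either per event in a plain kernel or, when a BLAS handle is available, as a GEMM with
// the color matrix followed by batched per-event dot products (see color_sum_blas below). A scalar reference
// version, using hypothetical helper names for orientation only, could read:
//   fptype colorSumOneHelicity( const cxtype* jamp ) // jamp[ncolor] for one event and one helicity
//   {
//     fptype deltaME = 0;
//     for( int icol = 0; icol < ncolor; icol++ )
//       for( int jcol = 0; jcol < ncolor; jcol++ )
//         deltaME += colorMatrix[icol][jcol] / colorDenom[icol] * ( jamp[icol].real() * jamp[jcol].real() + jamp[icol].imag() * jamp[jcol].imag() );
//     return deltaME;
//   }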
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
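      // [Editor's note, comments only] Spelling out the identity quoted above: with jamp = A + i*B per color and
      // a real color matrix M (here M_ij = colorMatrix[i][j]/colorDenom[i]),
      //   (A - iB)^T M (A + iB) = A^T M A + i A^T M B - i B^T M A + B^T M B
      // and for symmetric M the two imaginary cross terms cancel (B^T M A = A^T M B), leaving A^T M A + B^T M B.
      // The triangular cf2 table stores M_ii on the diagonal and 2*M_ij (j > i) off the diagonal, so the loops
      // below visit each unordered color pair exactly once.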
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for all good helicities in a single call + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/configs.inc index 137b6b3695..b386c37679 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/configs.inc +++
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/configs.inc @@ -180,3 +180,5 @@ C Diagram 12 DATA (SPROP(I,-4,12),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/12/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f index d8518f17f7..439883b7b1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f index 4b8ccfcacb..9eade535f2 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -316,17 +313,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -403,7 +389,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +433,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +479,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(2,1) T(6,3,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(2,4) T(6,3,1) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(3,1) T(6,2,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(3,4) T(6,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
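[Editor's note] The matrix1.f hunks above and below replace the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix by a packed integer upper triangle (off-diagonal entries pre-doubled, one common DENOM divided out at the end), which the next hunk walks with a running CF_INDEX. A minimal C++ sketch of the equivalent indexing, assuming NCOLOR=4 and the DATA values shown above (illustration only, not part of the patch):

  #include <cassert>
  int main()
  {
    const int ncolor = 4;
    const int cfPacked[10] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 }; // rows I=1..4, entries J=I..4
    int cfIndex = 0; // mimics the Fortran CF_INDEX counter
    for( int i = 0; i < ncolor; i++ )
      for( int j = i; j < ncolor; j++ )
      {
        const int packed = i * ncolor - i * ( i - 1 ) / 2 + ( j - i ); // closed form for the same walk
        assert( packed == cfIndex ); // CF(CF_INDEX) multiplies JAMP(J,M) in the Fortran loop
        (void)cfPacked[cfIndex++];
      }
    return 0;
  }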
@@ -601,10 +590,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +604,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc index c933a8f276..dc3d8b4896 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
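[Editor's note] The new CUDA path above repeatedly refers to the ghelAllJamps "super-buffer", whose layout [2][ncolor][nGoodHel][nevt] (real/imaginary plane, then color, then good-helicity index, then event) is spelled out in the striding comments of color_sum.cc. A minimal index helper consistent with that documented layout (illustration only; the actual accessor is the DeviceAccessJamp class, which is not shown in this excerpt):

  // Hypothetical helper: flat index into ghelAllJamps[2][ncolor][nGoodHel][nevt]
  inline int jampSuperIndex( int ix2, int icol, int ighel, int ievt, int ncolor, int nGoodHel, int nevt )
  {
    // ix2 = 0 selects the real plane, ix2 = 1 the imaginary plane (mgOnGpu::nx2 == 2)
    return ix2 * ncolor * nGoodHel * nevt + icol * nGoodHel * nevt + ighel * nevt + ievt;
  }
  // This is why sigmaKin can pass the per-helicity slice ghelAllJamps + ighel * nevt to calculate_jamps
  // and to color_sum_kernel, which then use ihel = 0 on that slice with the same nGoodHel stride.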
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -506,156 +562,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gd_ttxwmu()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
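The atomicAdd in the new GPU branch above is needed because, with one CUDA/HIP stream per good helicity (see sigmaKin below), several calculate_jamps kernels may update the same jamp2 slot concurrently. A minimal sketch of that accumulation pattern under the same assumption (hypothetical simplified kernel with a scalar layout; atomicAdd on double requires compute capability 6.0 or later):

// Hypothetical sketch: each helicity stream atomically adds |jamp[icol]|^2
// for its helicity into a per-color buffer shared by all helicity streams.
__global__ void accumulateJamp2( double* jamp2,        // [ncolor*nevt], shared across helicity streams
                                 const double* jampRe, // [ncolor*nevt], this helicity only
                                 const double* jampIm, // [ncolor*nevt], this helicity only
                                 int ncolor, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt >= nevt ) return;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double re = jampRe[icol * nevt + ievt];
    const double im = jampIm[icol * nevt + ievt];
    // atomicAdd: kernels launched on different streams may race on this slot
    atomicAdd( &jamp2[icol * nevt + ievt], re * re + im * im );
  }
}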
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -775,7 +718,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +757,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +804,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -973,8 +928,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +937,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1145,13 +1278,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,18 +1290,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1199,93 +1331,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1396,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1419,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,21 +1428,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1387,8 +1458,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1477,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1584,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h index a5c44d3213..1510b6bae5 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f index 3779397ce4..e19077b3dc 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f index 1dae307565..4ce490707d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) S2=PDG2PDF(LPP(IB(2)),3, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -456,51 +460,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
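The new color_sum.cc that starts here collects the color algebra previously inlined in calculate_wavefunctions: a C++ implementation (color_sum_cpu), a plain CUDA/HIP kernel (color_sum_kernel) and a cuBLAS/hipBLAS path (color_sum_blas). Following the comments in the file, all of them add to |M|^2, per event and per helicity, the same quadratic form of the jamps J with the real color matrix C and denominators d; the BLAS path factors this into one GEMM Z = C~ J over all helicities and events plus batched per-event dot products (the "Step 1"/"Step 2" calls below). In compact notation:

\Delta|M|^2
  = \sum_{i,j=1}^{n_{\mathrm{color}}} J_i^{*}\,\frac{C_{ij}}{d_i}\,J_j
  = \operatorname{Re}J^{\top}\,\tilde{C}\,\operatorname{Re}J
  + \operatorname{Im}J^{\top}\,\tilde{C}\,\operatorname{Im}J ,
\qquad \tilde{C}_{ij} = \frac{C_{ij}}{d_i} .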
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
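The TriangularNormalizedColorMatrix above folds the factor 2 and the division by colorDenom into the upper triangle at compile time, relying on the symmetry of the color matrix; the GPU kernel further down uses the same identity with the lower triangle ("ihel3p1"). A standalone sanity check of that identity on a hypothetical 2x2 symmetric matrix (values illustrative only, not this process's color matrix):

// Check that the triangular form  sum_i M_ii x_i^2 + sum_{i<j} 2 M_ij x_i x_j
// equals the full symmetric quadratic form  sum_{i,j} M_ij x_i x_j.
#include <cassert>
#include <cmath>

int main()
{
  const double M[2][2] = { { 12, 4 }, { 4, 12 } }; // symmetric, illustrative values
  const double x[2] = { 0.3, -1.7 };
  double full = 0, tri = 0;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
      full += M[i][j] * x[i] * x[j];
  for( int i = 0; i < 2; i++ )
  {
    tri += M[i][i] * x[i] * x[i];
    for( int j = i + 1; j < 2; j++ )
      tri += 2 * M[i][j] * x[i] * x[j];
  }
  assert( std::abs( full - tri ) < 1e-12 ); // the two forms agree
  return 0;
}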
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for all good helicities + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/configs.inc index 4cdcf03d63..03c4795328 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/configs.inc @@ -174,3 +174,5 @@ C Diagram 12 DATA (SPROP(I,-4,12),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/12/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f index d8518f17f7..439883b7b1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f index a3a57cd8b8..e520ea078c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -316,17 +313,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -403,7 +389,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +433,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +479,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(6,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(6,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,6,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,6,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
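
The matrix1.f change above replaces the full REAL*8 CF(NCOLOR,NCOLOR) matrix by a packed integer array that keeps only the upper triangle (row I stores the entries J = I..NCOLOR), with off-diagonal values already doubled and a single integer DENOM divided out at the end; the loop in the next hunk walks this packed array with CF_INDEX. A minimal C++ sketch of the equivalent computation, with illustrative names only (this is not the generated code):

#include <complex>

// Packed upper-triangular color sum: cf holds row i = 0..ncolor-1, each with entries j = i..ncolor-1,
// off-diagonal values pre-doubled; the common integer denominator is divided out once at the end.
double matrixFromPackedCF( const int* cf, int denom, int ncolor, const std::complex<double>* jamp )
{
  double result = 0;
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cf[cfIndex++] ) * jamp[j]; // packed upper triangle
    result += ( ztemp * std::conj( jamp[i] ) ).real(); // accumulating into the REAL*8 MATRIX1 keeps the real part
  }
  return result / denom;
}

For NCOLOR=4 the packed array has 4*5/2 = 10 entries, exactly the values in the four DATA statements above ({12,8,8,0}, {12,0,8}, {12,8}, {12}), i.e. the old square matrix with its off-diagonal 4s doubled to 8.
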
@@ -601,10 +590,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +604,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc index 6f1f37d1eb..e2a0d50b47 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
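
The allJamps argument above is one helicity's slice of a jamp super-buffer laid out as [real/imag][icolor][ighel][ievt] with the event index running fastest, the same striding assumed by the D2F conversion kernel and the cuBLAS color sum earlier in this patch. A minimal sketch of the flattened index; the helper and its names are illustrative assumptions, not the plugin's DeviceAccessJamp API:

#include <cassert>

// Flat index into a jamp super-buffer laid out as [ix2][icol][ighel][ievt] (ievt fastest).
inline int jampFlatIndex( int ix2, int icol, int ighel, int ievt,
                          int ncolor, int nGoodHel, int nevt )
{
  assert( ix2 >= 0 && ix2 < 2 );            // 0 = real part, 1 = imaginary part
  assert( icol >= 0 && icol < ncolor );     // color flow index
  assert( ighel >= 0 && ighel < nGoodHel ); // good-helicity index
  assert( ievt >= 0 && ievt < nevt );       // event index (fastest)
  return ( ( ix2 * ncolor + icol ) * nGoodHel + ighel ) * nevt + ievt;
}

Offsetting the super-buffer by ighel * nevt, as the stream-launch code elsewhere in this patch does, selects one helicity's slice while keeping the same strides, which is why the per-helicity kernels can index it with ihel = 0.
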
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -506,156 +562,43 @@ namespace mg5amcCpu jamp_sv[1] -= 1. / 6. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gdx_ttxwpux()?) 
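
Because each good helicity is now processed by its own kernel launch in its own CUDA/HIP stream, several of these kernels can be resident at the same time and all of them add their |jamp|^2 into the same colAllJamp2s[icol][ievt] slot, hence the atomicAdd above. A standalone sketch of the pattern, with illustrative buffer and kernel names (not the plugin's own):

#include <cuda_runtime.h>

// One launch per good helicity, each on its own stream; all launches accumulate into the same
// jamp2 buffer of size ncolor*nevt, so the read-modify-write must be atomic.
// (atomicAdd on double requires compute capability >= 6.0.)
__global__ void accumulateJamp2( double* jamp2,          // [ncolor][nevt], shared across helicity streams
                                 const double* absJamp2, // [ncolor][nevt], |jamp|^2 for this helicity
                                 const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    atomicAdd( &jamp2[icol * nevt + ievt], absJamp2[icol * nevt + ievt] );
}

Without the atomic, two helicity kernels scheduled concurrently could both read the old value of the same slot and one contribution would be lost.
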
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
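
The comments being removed here describe the two properties that the new color-sum code still relies on: the color matrix is real, so the quadratic form J^dagger M J collapses to Re(J) M Re(J) + Im(J) M Im(J), and it is symmetric, so only one triangle needs to be visited if the off-diagonal entries are counted twice. A small self-contained sketch of that computation, using plain std::complex rather than the plugin's vector types and assuming the matrix is already normalized:

#include <complex>
#include <vector>

// Color sum over a real symmetric (already normalized) matrix m, visiting only the lower triangle:
// |M|^2 = sum_i m[i][i]*|J_i|^2 + 2*sum_{j<i} m[i][j]*( Re J_i Re J_j + Im J_i Im J_j ).
double colorSumTriangular( const std::vector<std::vector<double>>& m,
                           const std::vector<std::complex<double>>& jamp )
{
  double me2 = 0;
  const int ncolor = (int)jamp.size();
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = m[icol][icol] * jamp[icol].real(); // diagonal term, counted once
    double ztempI = m[icol][icol] * jamp[icol].imag();
    for( int jcol = 0; jcol < icol; jcol++ )
    {
      ztempR += 2 * m[icol][jcol] * jamp[jcol].real(); // each off-diagonal pair counted via the factor 2
      ztempI += 2 * m[icol][jcol] * jamp[jcol].imag();
    }
    me2 += ztempR * jamp[icol].real() + ztempI * jamp[icol].imag();
  }
  return me2;
}

This mirrors the icol loop with jcol < icol in the new color_sum_kernel shown earlier; the imaginary cross terms drop out because the matrix is real, which is the AMA + BMB identity quoted in the removed comment.
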
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -775,7 +718,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +757,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +804,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -973,8 +928,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +937,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1145,13 +1278,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,18 +1290,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1199,93 +1331,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1396,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1419,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,21 +1428,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1387,8 +1458,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1477,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1584,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h index d0dd16c512..28103f2454 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f index 7c1bbde100..181a9c7408 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f index ece4509a8c..e6c9ab31c6 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) DX2=PDG2PDF(LPP(IB(2)),-1, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -456,51 +460,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
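// A minimal C++ sketch of the warp-blocked event indexing used in the restructured DSIG1_VEC loop
// above: events are processed warp by warp, the beam ordering IB(1)/IB(2) is fixed once per warp
// from the per-warp mirror flag, and the flat event index is IVEC=(CURR_WARP-1)*WARP_SIZE+IWARP.
// Fortran is 1-based; this sketch uses 0-based indices, and names such as warpSize, nbWarpUsed and
// imirrorPerWarp are illustrative only, not the generated code.
#include <cstdio>
#include <vector>

int main()
{
  const int warpSize = 4;                           // events per warp (WARP_SIZE), illustrative value
  const int nbWarpUsed = 2;                         // number of warps used (NB_WARP_USED), illustrative value
  const std::vector<int> imirrorPerWarp = { 1, 2 }; // per-warp mirror flag (IMIRROR_VEC), illustrative values
  for( int curwarp = 0; curwarp < nbWarpUsed; curwarp++ )
  {
    // Beam ordering is fixed once per warp, as in the Fortran IB(1)/IB(2) assignment
    const int ib0 = ( imirrorPerWarp[curwarp] == 1 ? 0 : 1 );
    const int ib1 = 1 - ib0;
    for( int iwarp = 0; iwarp < warpSize; iwarp++ )
    {
      const int ivec = curwarp * warpSize + iwarp; // flat event index, cf. IVEC=(CURR_WARP-1)*WARP_SIZE+IWARP
      printf( "warp=%d lane=%d ivec=%d beams=(%d,%d)\n", curwarp, iwarp, ivec, ib0, ib1 );
    }
  }
  return 0;
}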
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
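// A minimal standalone sketch of the triangular color sum described above, using the same 4x4
// color matrix and unit color denominators shown for this process: since the color matrix is real
// and symmetric, the quadratic form J^dagger (CF/denom) J reduces to Re(J).M.Re(J) + Im(J).M.Im(J),
// and only the upper triangle (with off-diagonal entries doubled) needs to be visited. The toy
// jamp values below are arbitrary illustration inputs, not physics output.
#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 }, { 0.7, -0.8 } };
  // Naive reference: full double loop over the square color matrix
  double meFull = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meFull += std::real( std::conj( jamp[i] ) * ( cf[i][j] / denom[i] ) * jamp[j] );
  // Triangular version: diagonal once, off-diagonal terms doubled, real and imaginary parts summed separately
  double meTri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = ( cf[i][i] / denom[i] ) * jamp[i].real();
    double ztempI = ( cf[i][i] / denom[i] ) * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += ( 2 * cf[i][j] / denom[i] ) * jamp[j].real();
      ztempI += ( 2 * cf[i][j] / denom[i] ) * jamp[j].imag();
    }
    meTri += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  printf( "full=%f triangular=%f\n", meFull, meTri );
  assert( std::abs( meFull - meTri ) < 1e-12 ); // the two formulations agree
  return 0;
}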
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/configs.inc index 54530d6f24..b65b28a284 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/configs.inc @@ -174,3 +174,5 @@ C Diagram 12 DATA (SPROP(I,-4,12),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/12/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f index d8518f17f7..439883b7b1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f index e550640e16..e06cd80f95 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -316,17 +313,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -403,7 +389,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +433,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +479,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,6) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,6) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,6) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,6) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
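// A minimal C++ sketch of the packed color-matrix storage introduced above in matrix1.f: the
// NCOLOR x NCOLOR real symmetric color matrix is now stored as its upper triangle in a 1-D integer
// array CF(NCOLOR*(NCOLOR+1)/2), with off-diagonal entries pre-doubled and a single common DENOM
// factored out, and a running counter walks the triangle row by row exactly as CF_INDEX does in
// the Fortran summation loop that follows. The jamp values are arbitrary illustration inputs.
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 4;
  constexpr int denom = 1; // common denominator (DENOM)
  // Packed upper triangle of {{12,4,4,0},{4,12,0,4},{4,0,12,4},{0,4,4,12}}, row by row,
  // with off-diagonal entries doubled (4 -> 8), matching the DATA statements above
  constexpr int cfPacked[ncolor * ( ncolor + 1 ) / 2] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 };
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 }, { 0.7, -0.8 } };
  double matrix = 0;
  int cfIndex = 0; // running index into the packed triangle (CF_INDEX)
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cfPacked[cfIndex++] ) * jamp[j]; // ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J)
    matrix += std::real( ztemp * std::conj( jamp[i] ) ); // real part only: the color matrix is real and symmetric
  }
  matrix /= denom; // MATRIX1 = MATRIX1/DENOM
  printf( "|M|^2 color sum from packed triangle = %f\n", matrix );
  return 0;
}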
@@ -601,10 +590,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +604,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc index 16d1e89a53..3524120821 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
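// A minimal sketch in the spirit of the Host/Device memory-access helpers selected above
// ("non-trivial access: buffer includes all events"): one flat buffer holds a quantity for all
// events, ieventAccessRecord jumps to the record of a given event, and kernelAccess then reads or
// writes within that record. The class below is a simplified illustration with hypothetical names,
// not the real MemoryAccess* API.
#include <cstdio>
#include <vector>

class HostAccessScalarPerEvent
{
public:
  // Return the address of the record for event ievt inside a buffer that contains all events
  static double* ieventAccessRecord( double* buffer, const int ievt ) { return &buffer[ievt]; }
  // Access the value inside a single-event record (trivial here: one scalar per event)
  static double& kernelAccess( double* record ) { return *record; }
};

int main()
{
  const int nevt = 8;
  std::vector<double> allMEs( nevt, 0. ); // one running |M|^2 sum per event
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    double* MEs = HostAccessScalarPerEvent::ieventAccessRecord( allMEs.data(), ievt );
    HostAccessScalarPerEvent::kernelAccess( MEs ) += 0.5 * ievt; // add a dummy per-helicity contribution
  }
  for( int ievt = 0; ievt < nevt; ievt++ ) printf( "ievt=%d ME=%f\n", ievt, allMEs[ievt] );
  return 0;
}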
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -506,156 +562,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxwpd()?) 
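// A minimal sketch of the event-by-event color choice that the jamp2 running sums above feed
// (see also select_col further below): each leading color flow icol gets a weight jamp2[icol]
// accumulated over the good helicities, and a uniform random number then picks one flow with
// probability proportional to its weight. This simplified version ignores the per-channel
// filtering of allowed color flows done in the real code; the numbers are illustration inputs.
#include <cstdio>

int main()
{
  constexpr int ncolor = 4;
  const double jamp2[ncolor] = { 0.4, 1.2, 0.1, 0.3 }; // per-color weights summed over helicities
  const double rndcol = 0.65;                          // uniform random number in [0,1)
  double total = 0;
  for( int icol = 0; icol < ncolor; icol++ ) total += jamp2[icol];
  // Walk the cumulative distribution and stop at the first color whose cumulative weight exceeds rndcol*total
  int selcol = ncolor - 1; // fallback to the last color (guards against rounding at the upper edge)
  double cumulative = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    cumulative += jamp2[icol];
    if( rndcol * total < cumulative ) { selcol = icol; break; }
  }
  printf( "selected color flow (0-based) = %d\n", selcol ); // NB Fortran reports a 1-based index
  return 0;
}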
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
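// A minimal sketch of the mixed-precision page handling referenced above and implemented just
// below (double for the Feynman amplitudes, float for the color algebra): two neppV-wide double
// pages are merged into one 2*neppV-wide float vector, the color algebra runs once in float, and
// the result is split back into the two double running sums. Plain arrays replace the SIMD vector
// types here, and the merge/split steps are hypothetical stand-ins for fpvmerge/fpvsplit.
#include <cstdio>

int main()
{
  constexpr int neppV = 4;                            // events per double-precision SIMD page (illustrative value)
  const double page0[neppV] = { 1.0, 2.0, 3.0, 4.0 }; // e.g. Re(jamp) for the first page of events
  const double page1[neppV] = { 5.0, 6.0, 7.0, 8.0 }; // e.g. Re(jamp) for the second page of events
  // "fpvmerge": pack the two double pages into one float vector of twice the width
  float merged[2 * neppV];
  for( int i = 0; i < neppV; i++ ) merged[i] = static_cast<float>( page0[i] );
  for( int i = 0; i < neppV; i++ ) merged[neppV + i] = static_cast<float>( page1[i] );
  // Single-precision "color algebra" step, done once for both pages (here just a square)
  float result[2 * neppV];
  for( int i = 0; i < 2 * neppV; i++ ) result[i] = merged[i] * merged[i];
  // "fpvsplit0/fpvsplit1": unpack the float result back into the two double running sums
  double deltaMEs0 = 0, deltaMEs1 = 0;
  for( int i = 0; i < neppV; i++ ) deltaMEs0 += result[i];
  for( int i = 0; i < neppV; i++ ) deltaMEs1 += result[neppV + i];
  printf( "page0 sum=%f page1 sum=%f\n", deltaMEs0, deltaMEs1 );
  return 0;
}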
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -775,7 +718,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +757,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +804,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -973,8 +928,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +937,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1145,13 +1278,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,18 +1290,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1199,93 +1331,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1396,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1419,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,21 +1428,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1387,8 +1458,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1477,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1584,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h index f799f32129..3be1db3774 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f index e5ddbf348a..f1c5e0251f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f index 4ebece2e78..3bde9a0625 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) C2=PDG2PDF(LPP(IB(2)),4, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -456,51 +460,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
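The auto_dsig1.f hunk above replaces the flat loop over IVEC with a warp-by-warp loop, so that the per-warp quantities IMIRROR_VEC and ICONF_VEC can be applied to every event of a warp before unweighting. A C++ sketch of the index mapping only (variable names and sizes are illustrative, not the Fortran ones):

#include <cstdio>
#include <vector>
int main()
{
  // Illustrative sizes only: 2 warps of 4 events each (the real code uses NB_WARP_USED and WARP_SIZE)
  const int nbWarpUsed = 2;
  const int warpSize = 4;
  const std::vector<int> imirrorVec = { 1, 2 }; // one mirror flag per warp, as IMIRROR_VEC(CURR_WARP)
  for( int curwarp = 0; curwarp < nbWarpUsed; curwarp++ )
  {
    // Per-warp setup: the beam ordering is decided once for all events of the warp
    const int ib1 = ( imirrorVec[curwarp] == 1 ? 1 : 2 );
    const int ib2 = ( imirrorVec[curwarp] == 1 ? 2 : 1 );
    for( int ilane = 0; ilane < warpSize; ilane++ )
    {
      const int ievt = curwarp * warpSize + ilane; // flat index, as IVEC = (CURR_WARP-1)*WARP_SIZE + IWARP (0-based here)
      std::printf( "warp=%d lane=%d ievt=%d IB=(%d,%d)\n", curwarp, ilane, ievt, ib1, ib2 );
    }
  }
  return 0;
}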
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
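The comment above relies on two properties of the color matrix: it is real, so the hermitian form J^dagger M J splits into separate contributions from Re(J) and Im(J) with no cross terms, and it is symmetric, so the double loop can be folded onto the upper triangle with the factor 2 absorbed at compile time. A scalar sketch follows (hypothetical 2-colour matrix, plain std::complex instead of the cxtype_sv vector types) that checks the folded loop reproduces the full sum; it is an illustration of the identity, not code from the plugin.

#include <cassert>
#include <cmath>
#include <complex>
int main()
{
  constexpr int ncolor = 2;
  const double denom[ncolor] = { 3, 3 };
  const double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } }; // real symmetric
  const std::complex<double> jamp[ncolor] = { { 1.5, -0.5 }, { 0.25, 2.0 } };
  // Full sum: ME = sum_ij conj(jamp_i) * (cf_ij/denom_i) * jamp_j, real for a real symmetric matrix
  std::complex<double> meFull = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meFull += std::conj( jamp[i] ) * ( cf[i][j] / denom[i] ) * jamp[j];
  // Folded sum: real and imaginary parts handled separately, upper triangle only, factor 2 off-diagonal
  double meFolded = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = ( cf[i][i] / denom[i] ) * jamp[i].real();
    double ztempI = ( cf[i][i] / denom[i] ) * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    meFolded += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  assert( std::abs( meFull.imag() ) < 1e-12 );              // imaginary part cancels by symmetry
  assert( std::abs( meFull.real() - meFolded ) < 1e-12 );   // folded loop equals the full double loop
  return 0;
}

The folding with /denom[icol] is exact as long as cf[i][j]/denom[i] is itself symmetric (equal denominators here), which is the same assumption made by the triangular implementation below.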
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/configs.inc index 7767ae3d5e..d95072bf21 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/configs.inc @@ -174,3 +174,5 @@ C Diagram 12 DATA (SPROP(I,-4,12),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/12/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f index d8518f17f7..439883b7b1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f index 738301d049..4cc5183dce 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -316,17 +313,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -403,7 +389,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +433,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +479,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(6,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(6,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,6,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,6,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
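[Editor's note] The matrix1.f change above replaces the full REAL*8 CF(NCOLOR,NCOLOR) color matrix with a packed integer upper triangle plus a common DENOM; the off-diagonal entries are doubled (4 becomes 8) so that the J = I, NCOLOR loop in the next hunk still reproduces the full symmetric quadratic form. A minimal C++ sketch of that equivalence follows; the names (colorSumPacked, cfPacked) are illustrative only and are not part of the generated code.

// Illustrative sketch: packed upper-triangular color sum for ncolor = 4,
// equivalent to the full symmetric matrix {12,4,4,0; 4,12,0,4; 4,0,12,4; 0,4,4,12}.
#include <array>
#include <complex>

double colorSumPacked( const std::array<std::complex<double>, 4>& jamp )
{
  constexpr int ncolor = 4;
  // Upper triangle, row by row, with off-diagonal entries doubled (4 -> 8)
  constexpr int cfPacked[ncolor * ( ncolor + 1 ) / 2] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 };
  constexpr int denom = 1; // common denominator factored out of the matrix
  double me = 0;
  int idx = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += (double)cfPacked[idx++] * jamp[j]; // j >= i only
    me += std::real( ztemp * std::conj( jamp[i] ) ); // doubled off-diagonals supply the missing lower triangle
  }
  return me / denom;
}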
@@ -601,10 +590,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +604,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc index 41a6e0002f..4688e54d18 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
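[Editor's note] The refactored calculate_jamps above is now a per-helicity kernel that writes its color amplitudes into a shared allJamps super-buffer of layout [2][ncolor][nGoodHel][nevt] (real/imaginary part, color, good helicity, event), matching the cuBLAS striding notes earlier in this section. The helper below is a hedged host-side illustration of that flat indexing; the actual code goes through the DeviceAccessJamp/DeviceAccessJamp2 accessors rather than a helper like this.

// Illustrative flat index into the allJamps super-buffer[2][ncolor][nGoodHel][nevt]
// (assumption: mirrors the striding comments in color_sum.cc, not the plugin API)
inline int jampIndex( int ireim,    // 0 = real part, 1 = imaginary part
                      int icol,     // color index in [0, ncolor-1]
                      int ighel,    // good-helicity index in [0, nGoodHel-1]
                      int ievt,     // event index in [0, nevt-1]
                      int ncolor, int nGoodHel, int nevt )
{
  return ( ( ireim * ncolor + icol ) * nGoodHel + ighel ) * nevt + ievt;
}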
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -506,156 +562,43 @@ namespace mg5amcCpu jamp_sv[1] -= 1. / 6. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxwmdx()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -775,7 +718,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +757,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +804,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -973,8 +928,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +937,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1145,13 +1278,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,18 +1290,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1199,93 +1331,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1396,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1419,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,21 +1428,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1387,8 +1458,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1477,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1584,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h index b6253b6715..aaf804d7b8 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f index 8e03eed7eb..e58319a9cb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f index 9d0ddcecfc..2361e40053 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) UX2=PDG2PDF(LPP(IB(2)),-2, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -456,51 +460,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
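As an editorial aside on the auto_dsig1.f rewrite above (illustrative only, not part of the generated patch): the flat IVEC loop is replaced by a nested CURR_WARP/IWARP loop so that per-warp quantities such as IMIRROR_VEC(CURR_WARP) (the IB(1)/IB(2) beam swap) and ICONF_VEC(CURR_WARP) are handled once per warp, while IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP still enumerates every event exactly once. A minimal host-side C++ sketch of that index mapping, with hypothetical warp counts and 0-based indices:

// Illustrative sketch only: hypothetical NB_WARP_USED/WARP_SIZE values,
// 0-based C++ indices instead of the Fortran 1-based ones.
#include <cassert>
#include <vector>
int main()
{
  const int nbWarpUsed = 4, warpSize = 8;             // hypothetical values
  const int vecsizeUsed = nbWarpUsed * warpSize;      // events processed per call
  const std::vector<int> imirrorVec = { 1, 2, 1, 2 }; // hypothetical per-warp mirror flags
  std::vector<int> visits( vecsizeUsed, 0 );
  for( int currWarp = 0; currWarp < nbWarpUsed; currWarp++ )
  {
    // The beam assignment is decided once per warp (the IB(1)/IB(2) swap)
    const int ib1 = ( imirrorVec[currWarp] == 1 ? 1 : 2 );
    const int ib2 = ( imirrorVec[currWarp] == 1 ? 2 : 1 );
    assert( ib1 + ib2 == 3 ); // always beams {1,2}, only their order changes
    for( int iwarp = 0; iwarp < warpSize; iwarp++ )
    {
      const int ivec = currWarp * warpSize + iwarp; // Fortran: (CURR_WARP-1)*WARP_SIZE+IWARP
      visits[ivec]++;
    }
  }
  for( int v : visits ) assert( v == 1 ); // every event index is covered exactly once
  return 0;
}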
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
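For reference (editorial note, not part of the generated code), the identity used in the comment above can be written out. Let J = A + iB be the vector of color amplitudes (A, B real) and let \(\hat M_{ij} = \mathrm{colorMatrix}[i][j]/\mathrm{colorDenom}[i]\) be the normalized color matrix, which is symmetric here since all denominators equal 1. Then

\[
J^{\dagger} \hat M J
\;=\; (A - iB)^{T} \hat M (A + iB)
\;=\; A^{T}\hat M A \;+\; i\,A^{T}\hat M B \;-\; i\,B^{T}\hat M A \;+\; B^{T}\hat M B
\;=\; A^{T}\hat M A \;+\; B^{T}\hat M B ,
\]

where the cross terms cancel because \(\hat M^{T} = \hat M\). The same symmetry lets each remaining real quadratic form be folded onto the upper triangle,

\[
A^{T}\hat M A \;=\; \sum_{i} \hat M_{ii}\,A_{i}^{2} \;+\; 2 \sum_{i<j} A_{i}\,\hat M_{ij}\,A_{j} ,
\]

which is exactly what the icol/jcol loop below computes, with the factor 2 and the division by colorDenom pre-baked into cf2 at compile time.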
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/configs.inc index 5b08a7cb7c..644de652d9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/configs.inc @@ -174,3 +174,5 @@ C Diagram 12 DATA (SPROP(I,-4,12),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/12/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f index d8518f17f7..439883b7b1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f index 6b3ff14d2d..676bb91921 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -316,17 +313,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -403,7 +389,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +433,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +479,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,6) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,6) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,6) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,6) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
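Editorial note on the matrix1.f change above: the full REAL*8 CF(NCOLOR,NCOLOR) array is replaced by a packed integer upper triangle CF(NCOLOR*(NCOLOR+1)/2), with the off-diagonal entries apparently stored already doubled (4 becomes 8) and a common DENOM divided out once at the end of the color loop (the MATRIX1/DENOM line in the next hunk); the DO J = I, NCOLOR loop with a running CF_INDEX then reproduces the full symmetric sum. A small C++ sketch, illustrative only and reusing the values from the DATA statements above, unpacks that representation back into the 4x4 matrix hardcoded in color_sum.cc:

// Illustrative only: unpack the upper-triangular integer CF of matrix1.f
// (off-diagonals stored doubled, common denominator DENOM) into the full
// symmetric color matrix and compare with the one in color_sum.cc.
#include <cassert>
int main()
{
  constexpr int ncolor = 4;
  const int cfPacked[ncolor * ( ncolor + 1 ) / 2] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 }; // DATA (CF(I),...)
  const int denom = 1;                                                                    // DATA DENOM/1/
  const double expected[ncolor][ncolor] = { { 12, 4, 4, 0 },
                                            { 4, 12, 0, 4 },
                                            { 4, 0, 12, 4 },
                                            { 0, 4, 4, 12 } }; // colorMatrix in color_sum.cc
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = i; j < ncolor; j++ )
    {
      const double cfij = static_cast<double>( cfPacked[cfIndex++] ) / denom;
      const double full = ( i == j ? cfij : cfij / 2 ); // off-diagonals were stored doubled
      assert( full == expected[i][j] && full == expected[j][i] );
    }
  return 0;
}

This is the same doubled-off-diagonal convention used by TriangularNormalizedColorMatrix in the C++ color sum.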
@@ -601,10 +590,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +604,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc index f90db593a9..c5be0b0677 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,9 +99,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -107,10 +110,7 @@ namespace mg5amcCpu using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,43 +169,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ 
INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -217,7 +273,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -226,14 +281,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
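Editorial aside on the buffer shape introduced by this refactoring: the per-helicity jamp super-buffer is documented in the comments above as jamp[2][ncolor][nGoodHel][nevt], with the real and imaginary parts split into two planes so that the BLAS path can treat them as separate real matrices. A small host-side sketch of that flat indexing (illustrative only; the plugin's own accessor is DeviceAccessJamp, which is not reproduced here):

// Illustrative host-side sketch of the [2][ncolor][nGoodHel][nevt] flat layout
// described in the comments above (the real accessor class is DeviceAccessJamp).
#include <cassert>
#include <cstddef>
// Flat offset of element (ix2, icol, ighel, ievt): ix2=0 real plane, ix2=1 imaginary plane
inline std::size_t jampIndex( int ix2, int icol, int ighel, int ievt, int ncolor, int nhel, int nevt )
{
  return static_cast<std::size_t>( ix2 ) * ncolor * nhel * nevt
         + static_cast<std::size_t>( icol ) * nhel * nevt
         + static_cast<std::size_t>( ighel ) * nevt + ievt;
}
int main()
{
  const int ncolor = 4, nhel = 6, nevt = 32; // hypothetical sizes (nhel = number of good helicities)
  // The imaginary plane starts ncolor*nhel*nevt elements after the real plane
  assert( jampIndex( 1, 0, 0, 0, ncolor, nhel, nevt ) - jampIndex( 0, 0, 0, 0, ncolor, nhel, nevt )
          == static_cast<std::size_t>( ncolor ) * nhel * nevt );
  // Events for a given (icol, ighel) are contiguous, which is what the per-helicity
  // kernels and the BLAS striding rely on
  assert( jampIndex( 0, 2, 3, 5, ncolor, nhel, nevt ) + 1 == jampIndex( 0, 2, 3, 6, ncolor, nhel, nevt ) );
  return 0;
}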
@@ -259,14 +317,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -290,7 +344,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -304,7 +357,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -315,6 +367,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -506,156 +562,43 @@ namespace mg5amcCpu jamp_sv[3] += 1. / 6. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_udx_ttxwpg()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -775,7 +718,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +757,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +804,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -973,8 +928,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +937,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity 
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1145,13 +1278,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,18 +1290,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1199,93 +1331,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1396,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1419,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,21 +1428,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1387,8 +1458,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1477,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1584,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h index b4a0ccb74d..fc664f5841 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,6 +77,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -123,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -131,9 +133,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -153,34 +157,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f index 7e750641c8..1ca5709cc3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f index 28ad0eed08..ca38b13683 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) C1=PDG2PDF(LPP(IB(1)),4, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) DX2=PDG2PDF(LPP(IB(2)),-1, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -459,51 +463,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
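As a rough illustration of what the new color_sum.cc computes (this is not the generated code itself): for each event and helicity, the matrix element receives the color sum |M|^2 += sum_{i,j} Re( conj(J_i) * CF_ij * J_j ) / denom_i over the ncolor=4 QCD partial amplitudes J (the "jamps"). A minimal standalone C++ sketch follows; the jamp values are hypothetical, while ncolor, colorMatrix and colorDenom mirror the values defined for this process below.

// Minimal standalone sketch (not the generated code): the naive color sum for one event
// and one helicity. The jamp values are hypothetical placeholders.
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 4;
  const double colorDenom[ncolor] = { 1, 1, 1, 1 };
  const double colorMatrix[ncolor][ncolor] = { { 12, 4, 4, 0 },
                                               { 4, 12, 0, 4 },
                                               { 4, 0, 12, 4 },
                                               { 0, 4, 4, 12 } };
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.0 }, { -0.1, 0.4 }, { 0.2, 0.1 } }; // hypothetical
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += colorMatrix[icol][jcol] * jamp[jcol]; // row of the color matrix times the jamp vector
    me2 += std::real( std::conj( jamp[icol] ) * ztemp ) / colorDenom[icol]; // |M|^2 contribution
  }
  printf( "|M|^2 (color sum) = %f\n", me2 );
  return 0;
}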
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/configs.inc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/configs.inc index 939cb376b9..d418740afe 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/configs.inc +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/configs.inc @@ -180,3 +180,5 @@ C Diagram 12 DATA (SPROP(I,-4,12),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/12/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f index d8518f17f7..439883b7b1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/makefile_original.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f index 536bec2827..f501bedaee 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -72,10 +72,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -316,17 +313,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -403,7 +389,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +433,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +479,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(2,1) T(6,3,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(2,4) T(6,3,1) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(3,1) T(6,2,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(3,4) T(6,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
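The DATA statements above replace the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix by a packed INTEGER array holding only the NCOLOR*(NCOLOR+1)/2 upper-triangle entries plus a common DENOM, and the summation loop in the next hunk walks that array linearly through CF_INDEX, with the inner loop restricted to J >= I. A minimal C++ sketch of the same packed quadratic form (hypothetical names, not generated code):

#include <complex>
#include <cstddef>
#include <vector>

// Illustration only: color sum with the symmetric color matrix stored as in the new
// DATA statements, i.e. only the upper triangle (J >= I), as integers, with the
// off-diagonal entries already doubled (4 -> 8 above) and one common denominator.
// A single triangular sweep then reproduces the full quadratic form, because
// Re( conj(J_i) C_ij J_j ) + Re( conj(J_j) C_ji J_i ) = 2 C_ij Re( conj(J_i) J_j ).
double colorSumPacked( const std::vector<int>& cfPacked,               // ncolor*(ncolor+1)/2 entries
                       int denom,                                      // common denominator (DENOM)
                       const std::vector<std::complex<double>>& jamp ) // ncolor jamps
{
  const std::size_t ncolor = jamp.size();
  double me2 = 0.;
  std::size_t cfIndex = 0;
  for( std::size_t i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp( 0., 0. );
    for( std::size_t j = i; j < ncolor; j++ )
      ztemp += double( cfPacked[cfIndex++] ) * jamp[j]; // mirrors the CF_INDEX inner loop
    me2 += std::real( ztemp * std::conj( jamp[i] ) );
  }
  return me2 / denom;
}

Compared to the old full NCOLOR x NCOLOR loop this halves both the storage and the number of multiplications, while the final division by DENOM keeps the stored matrix entries as small integers.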
@@ -601,10 +590,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +604,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const 
int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. 
Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
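The HASBLAS handling above only ever adds -DMGONGPU_HAS_NO_BLAS to CXXFLAGS/GPUFLAGS (for hasNoBlas) or -lcublas/-lhipblas to BLASLIBFLAGS (for hasBlas), so the C++/CUDA sources must compile in both configurations. A minimal C++ sketch of how a translation unit can honour that switch (hypothetical helper, not plugin code):

// Illustration only: hasNoBlas builds define MGONGPU_HAS_NO_BLAS and must compile
// out any cuBLAS/hipBLAS dependence; hasBlas builds rely on BLASLIBFLAGS at link time.
bool blasColorSumAvailable()
{
#ifndef MGONGPU_HAS_NO_BLAS
  return true;  // a BLAS color sum may be requested (cf. the pBlasHandle path in color_sum_gpu)
#else
  return false; // kernel-only color sum (cf. the null-pBlasHandle assert in color_sum_gpu)
#endif
}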
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
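In the get_channel_cut changes above, the propagator factors are rewritten in terms of t - Mass**2 and Mass*Width, so a resonant s-channel now contributes a plain Breit-Wigner denominator 1/((t - M^2)^2 + (M*Gamma)^2) instead of the previous ((t-Mass)*(t+Mass))-based expression. A minimal C++ sketch of that factor (hypothetical helper, double precision, not the generated Fortran):

// Illustration only: per-propagator Breit-Wigner factor matching the new
// tmp = t - Mass**2 and tmp2 = Mass*Width combination in get_channel_cut.
double breitWignerChannelFactor( double t, double mass, double width )
{
  const double tmp = t - mass * mass; // t - M^2
  const double tmp2 = mass * width;   // M * Gamma
  return 1.0 / ( tmp * tmp + tmp2 * tmp2 );
}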
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk new file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, 
error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "
To save bandwidth not all diagrams were converted to PNG."; print PAGE "
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale 
for the variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
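The pseudorapidity fix in lhe_parser.py above swaps numerator and denominator: 0.5*ln((|p|-pz)/(|p|+pz)) is minus the pseudorapidity, whereas the corrected 0.5*ln((|p|+pz)/(|p|-pz)) agrees with the usual definition eta = -ln(tan(theta/2)). A short numerical check of that identity (standalone sketch, not code from the patch; the momentum components are arbitrary):

import math

def pseudorapidity(px, py, pz):
    # eta = 0.5 * ln((|p| + pz) / (|p| - pz)), as in the corrected property above
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

def pseudorapidity_from_angle(px, py, pz):
    # equivalent definition via the polar angle: eta = -ln(tan(theta / 2))
    theta = math.atan2(math.hypot(px, py), pz)
    return -math.log(math.tan(theta / 2.0))

px, py, pz = 10.0, -5.0, 40.0   # illustrative momentum components (GeV)
assert abs(pseudorapidity(px, py, pz) - pseudorapidity_from_angle(px, py, pz)) < 1e-12

# A particle moving forward (pz > 0) must have positive eta; the pre-fix
# expression would have returned the negative of this value.
assert pseudorapidity(px, py, pz) > 0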
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
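The do_pythia8 changes above only use the legacy MG5aMC_PY8_interface when --old_interface is passed; otherwise they look for Pythia8's own main164 driver and fall back to the old interface if it cannot be found. A schematic sketch of that selection order (the function name and argument names are placeholders; the real logic lives inside madevent_interface.py):

import os

def resolve_pythia8_driver(args, pythia8_path, mg5amc_py8_interface_path=None):
    """Pick the Pythia8 executable with the same precedence as the patch:
    --old_interface forces the MG5aMC_PY8_interface tool, otherwise prefer the
    main164 driver shipped with Pythia8 and fall back to the old tool."""
    use_old_interface = '--old_interface' in args
    if use_old_interface:
        args.remove('--old_interface')
        if not mg5amc_py8_interface_path:
            raise RuntimeError('MG5aMC_PY8_interface not available')
        return os.path.join(mg5amc_py8_interface_path, 'MG5aMC_PY8_interface')

    # Newer Pythia8 installations ship main164 in one of these two locations.
    candidates = [
        os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
        os.path.join(pythia8_path, 'examples', 'main164'),
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate

    # main164 not found (or not compiled): retry with the old interface.
    return resolve_pythia8_driver(args + ['--old_interface'],
                                  pythia8_path, mg5amc_py8_interface_path)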
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
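The wrapper scripts above are built with two rounds of %-formatting: the first pass only decides whether the new interface's '-c' command-line flag is inserted, so the shell and executable placeholders are written as %%s to survive until the second pass (exe_cmd % (shell_exe, basename)). A small standalone illustration of that escaping pattern (shell_exe and the executable name are dummy values):

# First pass: fix the optional '-c' flag; each '%%s' becomes '%s' for pass two.
use_mg5amc_py8_interface = False
template = """#!%%s
./%%s %s PY8Card.dat >& PY8_log.txt
""" % ('' if use_mg5amc_py8_interface else '-c')

# Second pass: fill in the interpreter and the executable name.
shell_exe = '/bin/bash'          # dummy value for the sketch
pythia_main = 'main164'          # dummy value for the sketch
exe_cmd = template % (shell_exe, pythia_main)
print(exe_cmd)
# #!/bin/bash
# ./main164 -c PY8Card.dat >& PY8_log.txt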
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
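The new remove_empty_events helper above drops channel (G*) directories whose events.lhe is missing or essentially empty, and classifies the cause by searching the channel log for known markers. A simplified sketch of that classification idea (not the method itself: the real implementation reads the log backwards via misc.BackRead, falls back to run1_app.log, and has extra handling for the "Deleting file events.lhe" marker and for unmatched logs):

import collections
import os

def classify_empty_channels(gdirs, markers=None):
    """Keep directories with a non-trivial events.lhe; bucket the rest by the
    reason found in their log file (simplified sketch of the loop above)."""
    markers = markers or {
        'Impossible BW configuration': 'bwconfig',
        'Loosen cuts or increase max_events': 'cuts',
        'all returned zero': 'zero',
    }
    kept, reasons = [], collections.defaultdict(list)
    for gdir in gdirs:
        events = os.path.join(gdir, 'events.lhe')
        if os.path.exists(events) and os.path.getsize(events) >= 10:
            kept.append(gdir)
            continue
        reason = 'unknown'
        log_path = os.path.join(gdir, 'log.txt')
        if os.path.exists(log_path):
            with open(log_path) as log:
                for line in log:
                    for marker, tag in markers.items():
                        if marker in line:
                            reason = tag   # last match wins, roughly mimicking a reverse scan
        reasons[reason].append(gdir)
    return kept, reasons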
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent b/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h index 850b86e0e6..9d6ce139ee 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc index d799b19eeb..cbce3f44c5 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h index e448052141..0fbfb533e9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 0a0d056033..a93dec7f6c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +56,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0064830780029296875  +DEBUG: model prefixing takes 0.005368709564208984  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -168,7 +167,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.030 s +5 processes with 7 diagrams generated in 0.032 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. 
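Both the restore_data script and the googletest makefile above special-case macOS (no nproc, different compiler flags). For Python-side tooling the same "how many cores are available" question can be answered portably; a minimal sketch of the equivalent detection (illustrative only, the scripts in the patch remain in shell and Make):

import multiprocessing
import platform
import subprocess

def available_cores():
    """Portable core count: standard library first, shell tools as fallback."""
    try:
        return multiprocessing.cpu_count()
    except NotImplementedError:
        pass
    if platform.system() == 'Darwin':
        # macOS has no nproc; sysctl reports the logical CPU count instead.
        out = subprocess.run(['sysctl', '-n', 'hw.ncpu'],
                             capture_output=True, text=True, check=True)
    else:
        out = subprocess.run(['nproc', '--all'],
                             capture_output=True, text=True, check=True)
    return int(out.stdout.strip())

print(available_cores())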
@@ -374,21 +373,21 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.933 s +65 processes with 1119 diagrams generated in 1.876 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Processing color information for process: g g > t t~ g g @2 @@ -499,9 +498,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 
103: 111, 104: 112, 105: 113} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1577]  INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
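The DEBUG lines above print iconfig_to_diag together with its inverse diag_to_iconfig; the second map is simply the first with keys and values swapped, which is only safe because the mapping is one-to-one. A one-line sketch of that inversion, using a small excerpt of the gg_ttxgg mapping shown in the log:

# Small excerpt of the mapping printed above (iconfig -> diagram number).
iconfig_to_diag = {1: 2, 2: 3, 3: 4, 30: 31, 31: 33}

# Invert it; valid only because no two iconfigs point to the same diagram.
diag_to_iconfig = {diag: iconfig for iconfig, diag in iconfig_to_diag.items()}

assert diag_to_iconfig[33] == 31
assert all(iconfig_to_diag[i] == d for d, i in diag_to_iconfig.items())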
@@ -510,9 +509,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -521,9 +520,9 @@ FileWriter t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -532,9 +531,9 @@ FileWriter t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -543,9 +542,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -554,9 +553,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -565,9 +564,9 @@ FileWriter t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -576,9 +575,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -587,9 +586,9 @@ FileWriter t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -598,9 +597,9 @@ FileWriter t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -609,9 +608,9 @@ FileWriter t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -620,9 +619,9 @@ FileWriter t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -631,9 +630,9 @@ FileWriter t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -642,9 +641,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -653,9 +652,9 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -664,9 +663,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -675,9 +674,9 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -686,25 +685,25 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1552]  -Generated helas calls for 18 subprocesses (372 diagrams) in 1.286 s -Wrote files for 810 helas calls in 2.762 s +DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1577]  +Generated helas calls for 18 subprocesses (372 diagrams) in 1.276 s +Wrote files for 810 helas calls in 3.216 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.340 s +ALOHA: aloha creates 5 routines in 0.337 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.315 s +ALOHA: aloha creates 10 routines in 0.332 s VVV1 VVV1 FFV1 @@ -717,120 +716,32 @@ ALOHA: aloha creates 10 routines in 0.315 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_uux_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 230 (offset 3 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 243 (offset 16 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 246 (offset 19 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 246 (offset 19 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_uux_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 246 (offset 19 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 275 (offset 48 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxuux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gu_ttxgu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gux_ttxgux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uc_ttxuc; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 76 (offset 5 lines). -Hunk #2 succeeded at 280 (offset 53 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_ucx_ttxucx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 82 (offset 11 lines). -Hunk #2 succeeded at 286 (offset 59 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uu_ttxuu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxccx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 82 (offset 11 lines). -Hunk #2 succeeded at 286 (offset 59 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxuux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxcx_ttxuxcx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 76 (offset 5 lines). -Hunk #2 succeeded at 280 (offset 53 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxux_ttxuxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #1 succeeded at 74 (offset 3 lines). -Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m11.258s -user 0m9.633s -sys 0m0.984s -Code generation completed in 12 seconds +real 0m13.682s +user 0m11.871s +sys 0m1.595s +Code generation completed in 14 seconds ************************************************************ * * * W E L C O M E to * @@ -843,7 +754,7 @@ Code generation completed in 12 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -851,10 +762,9 @@ Code generation completed in 12 seconds * Type 'help' for in-line help. 
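The DEBUG lines above show each per-subprocess patch being applied with 'patch -pN -i <patchfile>' and the final 'result.returncode = 0' check; the 'offset N lines' messages simply mean a hunk matched at a shifted position. A hypothetical standalone sketch (not code from output.py) of running such a command and verifying its exit status:

#include <cstdlib>
#include <stdexcept>
#include <string>

// Hypothetical helper: apply a patch file inside a directory and fail loudly
// if the patch command reports an error (exit status 0 means success).
void applyPatch( const std::string& dir, const std::string& patchFile, int striplevel )
{
  const std::string cmd = "cd " + dir + " && patch -p" + std::to_string( striplevel ) + " -i " + patchFile;
  const int status = std::system( cmd.c_str() );
  if( status != 0 ) throw std::runtime_error( "patch failed (status " + std::to_string( status ) + "): " + cmd );
}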
* * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -873,7 +783,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -881,10 +791,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT +++ b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. 
The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat index 33311e49bc..92b8989f46 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat index 5eb60f35df..fe9c38d826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat index 38810a6b83..0185201786 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt +++ b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f b/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc b/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc +++ b/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/makefile b/epochX/cudacpp/pp_tt012j.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/makefile +++ b/epochX/cudacpp/pp_tt012j.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc b/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc index 2588190439..e169c1f193 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc +++ b/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. 
Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
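The constructor logic shown above derives the GPU grid from the number of events: it starts from 256 threads per block and keeps halving until gpublocks*gputhreads equals nevt exactly, throwing if it would have to drop below the 32-thread minimum. A standalone sketch of that selection loop (hypothetical helper name; the real code updates the m_gpublocks/m_gputhreads members in place):

#include <stdexcept>
#include <string>
#include <utility>

std::pair<int, int> chooseGpuGrid( int nevt, int gputhreadsmin = 32 )
{
  if( nevt < gputhreadsmin || nevt % gputhreadsmin != 0 )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
  int gputhreads = 256;              // default number of gpu threads
  int gpublocks = nevt / gputhreads; // may initially be too small (or zero)
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2; // halve the threads per block until the grid matches nevt exactly
    if( gputhreads < gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" ); // unreachable after the checks above
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads };
}

For instance, nevt=96 ends up as 3 blocks of 32 threads, while nevt=512 keeps the default of 256 threads with 2 blocks.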
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
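The checkGpuBlas/assertGpuBlas helper added to GpuRuntime.h above plays the same role for gpuBlas status codes that checkGpu plays for gpuError_t. A short illustrative usage sketch (assuming the GpuAbstraction.h and GpuRuntime.h headers from this diff and a CUDA or HIP build with BLAS enabled; not code from the plugin itself):

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

#ifndef MGONGPU_HAS_NO_BLAS
void blasHandleOnStreamExample()
{
  gpuStream_t stream;
  gpuStreamCreate( &stream ); // the macro already wraps the call in checkGpu
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // abort unless the status is GPUBLAS_STATUS_SUCCESS
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // run subsequent BLAS calls on this stream
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream );
}
#endif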
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
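The rewritten MatrixElementKernelDevice::computeMatrixElements above no longer launches a single sigmaKin kernel: the host-side sigmaKin now fans work out over one GPU stream per good helicity (m_helStreams) and synchronizes once before helicity/color selection. A standalone sketch of that pattern in plain CUDA (kernel and buffer names are placeholders, not the plugin's):

```cpp
// stream_fanout_sketch.cu -- illustrative only: launch one kernel per good helicity
// on its own stream so the launches may overlap, then synchronize once before any
// cross-helicity step (reduction, helicity/color selection).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void perHelicityKernel( float* out, int ihel, int nevt ) // stand-in for calculate_jamps
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) out[ihel * nevt + ievt] = (float)ihel;
}

int main()
{
  const int nGoodHel = 4, blocks = 2, threads = 32, nevt = blocks * threads;
  float* d_out = nullptr;
  cudaMalloc( &d_out, nGoodHel * nevt * sizeof( float ) );
  cudaStream_t streams[nGoodHel];
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    perHelicityKernel<<<blocks, threads, 0, streams[ighel]>>>( d_out, ighel, nevt ); // may overlap
  cudaDeviceSynchronize(); // close the fan-out before the cross-helicity reduction/selection
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
  cudaFree( d_out );
  printf( "launched %d per-helicity kernels on %d streams\n", nGoodHel, nGoodHel );
  return 0;
}
```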
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h index 65a101888d..2fa0ce29e0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 
+201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef 
DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index a17c5f1eef..1d67401043 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
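The new DeviceAccessJamp2 helper above (and the DeviceAccessJamp accesses used later for the jamp super-buffers) addresses per-color, per-event data as buffer[icol * nevt + ievt], i.e. a structure-of-arrays layout in which consecutive threads (consecutive ievt) touch consecutive addresses. A standalone illustration of that indexing, with placeholder names:

```cpp
// soa_layout_sketch.cu -- illustrative only: the [icol * nevt + ievt] layout stores
// all events of a given color contiguously, so the threads of a warp (consecutive
// ievt) read and write consecutive addresses (coalesced accesses).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void fillSoA( float* buf, int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    buf[icol * nevt + ievt] = 100.f * icol + ievt; // coalesced store for each icol
}

int main()
{
  const int ncolor = 2, blocks = 2, threads = 32, nevt = blocks * threads;
  float* d_buf = nullptr;
  cudaMalloc( &d_buf, ncolor * nevt * sizeof( float ) );
  fillSoA<<<blocks, threads>>>( d_buf, ncolor );
  cudaDeviceSynchronize();
  float h_buf[ncolor * nevt];
  cudaMemcpy( h_buf, d_buf, sizeof( h_buf ), cudaMemcpyDeviceToHost );
  printf( "buf[icol=1][ievt=3] = %g (expect 103)\n", h_buf[1 * nevt + 3] );
  cudaFree( d_buf );
  return 0;
}
```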
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -368,154 +424,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_gg_ttx()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
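The new jamp2 accumulation above notes that atomicAdd is required once each good helicity runs in its own stream: kernels for different helicities may update the same per-event jamp2 entry concurrently, and a plain += would be a data race. A standalone illustration (placeholder names, not plugin code):

```cpp
// atomic_accum_sketch.cu -- illustrative only: several kernels launched on different
// streams accumulate into the same per-event buffer; atomicAdd keeps the sum well
// defined, whereas an unguarded read-modify-write could lose contributions.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void accumulate( float* accum, float contribution, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) atomicAdd( &accum[ievt], contribution ); // safe across concurrent streams
}

int main()
{
  const int nstreams = 4, blocks = 2, threads = 32, nevt = blocks * threads;
  float* d_accum = nullptr;
  cudaMalloc( &d_accum, nevt * sizeof( float ) );
  cudaMemset( d_accum, 0, nevt * sizeof( float ) );
  cudaStream_t streams[nstreams];
  for( int i = 0; i < nstreams; i++ ) cudaStreamCreate( &streams[i] );
  for( int i = 0; i < nstreams; i++ )
    accumulate<<<blocks, threads, 0, streams[i]>>>( d_accum, 1.f, nevt ); // concurrent updates
  cudaDeviceSynchronize();
  float h0 = 0;
  cudaMemcpy( &h0, d_accum, sizeof( float ), cudaMemcpyDeviceToHost );
  printf( "accum[0] = %g (expect %d)\n", h0, nstreams );
  for( int i = 0; i < nstreams; i++ ) cudaStreamDestroy( streams[i] );
  cudaFree( d_accum );
  return 0;
}
```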
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -555,7 +500,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +537,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +581,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -748,8 +705,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +714,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -920,13 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,18 +1067,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -974,93 +1108,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1173,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1196,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,21 +1205,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1162,8 +1235,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1254,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1361,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index 2d89e0e244..c4a9fe53db 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f index 19278bca59..f2058f757e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f index 42cc7c9d61..325bd60fb1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity +
const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given 
event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! 
From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need 
one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same<fptype, fptype2>::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/configs.inc index 99d3eecc56..0dbac30825 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/configs.inc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/configs.inc @@ 
-24,3 +24,5 @@ C Diagram 3 DATA (SPROP(I,-2,3),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/3/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f index ca1785b808..a2d45dc02c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! 
-1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -227,17 +224,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -307,7 +293,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +336,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +380,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +436,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +450,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index 0979455d7a..24c9be9271 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + 
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one 
event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... @@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -345,154 +401,43 @@ namespace mg5amcCpu jamp_sv[1] -= 1. / 2. 
* amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_uux_ttx()?) - - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 9, 3 }, - { 3, 9 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
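// A minimal self-contained CUDA sketch of the accumulation pattern noted in the hunk above: with one
// calculate_jamps kernel launched per good helicity in its own stream, several kernels may update the
// same colAllJamp2s[icol][ievt] element concurrently, so the per-color running sum of |jamp|^2 over
// helicities must use atomicAdd rather than a plain "+=". Assumptions: plain double precision, no
// memory-access helpers, and the kernel name accumulateJamp2 is illustrative only (the plugin does
// this inside calculate_jamps through its DeviceAccessJamp2 accessor).
#include <cuda_runtime.h>

__global__ void
accumulateJamp2( double* colAllJamp2s,  // in/out: [ncolor][nevt] running sum over colors and helicities
                 const double* jampRe,  // input: [ncolor][nevt] real parts of the jamps for this helicity
                 const double* jampIm,  // input: [ncolor][nevt] imaginary parts of the jamps for this helicity
                 const int ncolor,
                 const int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  if( ievt >= nevt ) return;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double re = jampRe[icol * nevt + ievt];
    const double im = jampIm[icol * nevt + ievt];
    // Kernels for other helicities (other streams) may be adding to the same element at the same time,
    // hence the atomic update (atomicAdd on double requires compute capability >= 6.0).
    atomicAdd( &colAllJamp2s[icol * nevt + ievt], re * re + im * im );
  }
}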
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -532,7 +477,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -565,6 +514,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -605,6 +558,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -725,8 +682,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -734,25 +691,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -897,13 +1032,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -915,18 +1044,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -951,93 +1085,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1079,7 +1150,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1102,7 +1173,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1111,21 +1182,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1139,8 +1212,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1156,11 +1231,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1262,14 +1338,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index d6fa3205c0..b2f1c18fba 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 1; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f index 6558c40922..cfdb6645ac 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f index 86f844defe..779ad4cdc1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -497,51 +501,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc new file mode 100644 index 0000000000..04c22fd369 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
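// A small plain-C++ indexing sketch of the allJamps super-buffer layout consumed by the new
// color_sum.cc that follows. Assumptions: the buffer shape is [2][ncolor][nGoodHel][nevt]
// (real/imaginary slice first, then color, then good helicity, then event), as documented for the
// ghelAllJamps argument of sigmaKin; the helper names jampIndex and readJamp are illustrative only
// (the plugin reads this buffer through its DeviceAccessJamp memory accessor instead).
#include <complex>
#include <cstddef>

// Flat offset of the real (reim=0) or imaginary (reim=1) part of jamp(icol) for helicity ihel and event ievt
inline std::size_t
jampIndex( int reim, int icol, int ihel, int ievt, int ncolor, int nGoodHel, int nevt )
{
  return ( ( static_cast<std::size_t>( reim ) * ncolor + icol ) * nGoodHel + ihel ) * nevt + ievt;
}

// Reassemble one complex partial amplitude from the two real slices of the super-buffer
inline std::complex<double>
readJamp( const double* allJamps, int icol, int ihel, int ievt, int ncolor, int nGoodHel, int nevt )
{
  return { allJamps[jampIndex( 0, icol, ihel, ievt, ncolor, nGoodHel, nevt )],
           allJamps[jampIndex( 1, icol, ihel, ievt, ncolor, nGoodHel, nevt )] };
}
// This striding keeps each real or imaginary slice contiguous as an (nGoodHel*nevt) x ncolor matrix,
// which is what allows the BLAS-based color sum below to operate on it directly.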
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 9, 3 }, + { 3, 9 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/configs.inc index a99b3c9fba..ef48c8df8d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/configs.inc @@ -6,3 +6,5 @@ C Diagram 1 DATA TPRID(-1,1)/0/ C Number of configs DATA MAPCONFIG(0)/1/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f index ec88a303fa..34923f2e60 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -230,17 +227,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -310,7 +296,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -356,7 +342,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -399,21 +386,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /9.000000000000000D+00 - $ ,3.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 2) /9,6/ C 1 T(2,1) T(3,4) - DATA (CF(I, 2),I= 1, 2) /3.000000000000000D+00 - $ ,9.000000000000000D+00/ + DATA (CF(I),I= 3, 3) /9/ C 1 T(2,4) T(3,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -444,10 +434,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -456,6 +448,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 5de1c626c8..037b031386 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. 
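The matrix1.f hunk above replaces the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix with a packed integer upper triangle plus a common DENOM, so the color sum visits each off-diagonal pair only once. Below is a minimal standalone C++ sketch of that packed-triangular sum; it is not part of the patch, and the names packedColorSum, cfPacked and denom are illustrative only.

    // Sketch of the packed-triangular color sum implemented by the regenerated matrix1.f:
    // the symmetric color matrix is stored as a 1-D integer array over its upper triangle,
    // with off-diagonal entries pre-doubled, and a common integer denominator divided out once.
    #include <complex>
    #include <vector>

    double packedColorSum( const std::vector<int>& cfPacked,                  // ncolor*(ncolor+1)/2 entries
                           const std::vector<std::complex<double>>& jamp,     // ncolor color flows
                           int denom )
    {
      const int ncolor = static_cast<int>( jamp.size() );
      double me2 = 0.;
      int cfIndex = 0;
      for( int i = 0; i < ncolor; i++ )
      {
        std::complex<double> ztemp = 0.;
        for( int j = i; j < ncolor; j++ ) ztemp += double( cfPacked[cfIndex++] ) * jamp[j]; // upper triangle only
        me2 += std::real( ztemp * std::conj( jamp[i] ) ); // taking the real part reproduces the full symmetric quadratic form
      }
      return me2 / denom;
    }

For the P0_uux_ttx case above this corresponds to cfPacked = {9, 6, 9} (the off-diagonal 3 pre-doubled to 6) and denom = 1.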
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
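For reference, the striding that the comments in calculate_jamps, convertD2F_Jamps and color_sum_blas describe for the jamp super-buffer jamp[2][ncolor][nhel][nevt] can be summarised by the following flat-index helper. This is an illustrative sketch only (jampFlatIndex is not a function in the patch): real/imaginary part varies slowest, then color, then helicity, then event, so consecutive GPU threads of one helicity stream touch consecutive memory locations.

    // Hypothetical helper mirroring the DeviceAccessJamp striding documented in this patch
    inline int jampFlatIndex( int ix2,    // 0 = real part, 1 = imaginary part
                              int icol,   // color flow index, 0 <= icol < ncolor
                              int ihel,   // good-helicity index, 0 <= ihel < nhel
                              int ievt,   // event index, 0 <= ievt < nevt
                              int ncolor,
                              int nhel,
                              int nevt )
    {
      return ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt;
    }

With this layout the per-helicity sub-buffer handed to one CUDA stream is obtained simply as ghelAllJamps + ighel * nevt, which matches the pointer arithmetic used elsewhere in this patch (the per-helicity kernels then access it with ihel = 0 and nhel = nGoodHel).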
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -567,158 +623,43 @@ namespace mg5amcCpu jamp_sv[5] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxg()?) 
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
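The removed comment block above summarises the two algebraic properties that survive in the new color_sum code: because the color matrix M is real, the quadratic form J†MJ reduces to A·M·A + B·M·B with A = Re(J) and B = Im(J); and because M is symmetric, only the upper triangle needs to be visited once the factor 2 and the 1/denom normalisation are folded in at compile time. A minimal standalone C++ sketch follows; it is illustrative only and, for brevity, reuses the 2-colour values from the P0_uux_ttx matrix1.f hunk earlier in this patch rather than the 6-colour gg→ttxg matrix removed here.

    // Sketch: compile-time normalized triangular color matrix and the A.M.A + B.M.B color sum
    #include <array>

    constexpr int NCOL = 2;
    constexpr double CF[NCOL][NCOL] = { { 9, 3 }, { 3, 9 } };
    constexpr double DENOM[NCOL] = { 1, 1 };

    struct TriangularNormalized
    {
      constexpr TriangularNormalized() : value()
      {
        for( int i = 0; i < NCOL; i++ )
        {
          value[i][i] = CF[i][i] / DENOM[i];                                        // diagonal term
          for( int j = i + 1; j < NCOL; j++ ) value[i][j] = 2 * CF[i][j] / DENOM[i]; // off-diagonal, pre-doubled
        }
      }
      double value[NCOL][NCOL];
    };

    inline double colorSum( const std::array<double, NCOL>& jampR, const std::array<double, NCOL>& jampI )
    {
      constexpr auto cf2 = TriangularNormalized();
      double me2 = 0;
      for( int i = 0; i < NCOL; i++ )
      {
        double ztempR = cf2.value[i][i] * jampR[i];
        double ztempI = cf2.value[i][i] * jampI[i];
        for( int j = i + 1; j < NCOL; j++ )
        {
          ztempR += cf2.value[i][j] * jampR[j];
          ztempI += cf2.value[i][j] * jampI[j];
        }
        me2 += jampR[i] * ztempR + jampI[i] * ztempI; // A.M.A + B.M.B contribution for row i
      }
      return me2;
    }

For any complex jamp vector, colorSum( Re(jamp), Im(jamp) ) reproduces the full quadratic form Σij Ji* Mij Jj exactly, the imaginary cross terms cancelling by symmetry.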
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -774,7 +715,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -808,6 +753,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -849,6 +798,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -969,8 +922,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -978,25 +931,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1141,13 +1272,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1159,18 +1284,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1195,93 +1325,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1323,7 +1390,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1346,7 +1413,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1355,21 +1422,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1383,8 +1452,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1400,11 +1471,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1506,14 +1578,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 2acfa000a7..69d8ea8b08 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index 10496aa04d..19937ed005 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index 7c8695090c..9e5f9c9b0a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. 
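
Before the Fortran changes continue below, it may help to see in isolation the GPU scheduling pattern that the new sigmaKin earlier in this patch relies on: one CUDA/HIP stream per good helicity, calculate_jamps launched on stream ighel for each good helicity, a per-helicity color sum, and a device-wide synchronize before the helicity/color selection kernels. The fragment below is a minimal standalone sketch of that launch-per-stream pattern written against the plain CUDA runtime API instead of the plugin's gpu* macros; the kernel, buffer and function names are hypothetical placeholders (not plugin code) and error checking is omitted.

```cpp
// Minimal sketch of the "one stream per good helicity" pattern (hypothetical names, compile with nvcc).
#include <cassert>
#include <cuda_runtime.h>

__global__ void toyJampKernel( int ihel, float* hJamps, int nevt ) // stand-in for calculate_jamps
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) hJamps[ievt] = (float)ihel; // write something per event for this helicity
}

void launchPerHelicity( float* dAllJamps, const int* goodHel, int nGoodHel, int gpublocks, int gputhreads )
{
  const int nevt = gpublocks * gputhreads;
  assert( nGoodHel <= 32 ); // at most ncomb = 32 helicity combinations for this process
  cudaStream_t streams[32];
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    float* hJamps = dAllJamps + ighel * nevt; // per-helicity slice of the super-buffer
    toyJampKernel<<<gpublocks, gputhreads, 0, streams[ighel]>>>( goodHel[ighel], hJamps, nevt );
  }
  cudaDeviceSynchronize(); // as in sigmaKin: selection must wait until all helicity streams have completed
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
}
```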
- IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..9e3ce9d917 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for 
one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x 
+ threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* 
ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/configs.inc index 1eb9c578f9..a3ad3e22cf 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/configs.inc @@ -171,3 +171,5 @@ C Diagram 15 DATA (SPROP(I,-3,15),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/15/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f index 797b19405d..48e24320cc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -243,17 +240,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -323,7 +309,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +352,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(9) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -409,43 +396,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /7.111111111111111D+00, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,1.111111111111111D-01,1.111111111111111D-01,1.111111111111111D - $ +00/ + DATA DENOM/9/ + DATA (CF(I),I= 1, 6) /64,-16,-16,2,2,20/ C 1 T(1,2,5,3,4) - DATA (CF(I, 2),I= 1, 6) /-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01,1.111111111111111D - $ +00,-8.888888888888888D-01,1.111111111111111D-01/ + DATA (CF(I),I= 7, 11) /64,2,20,-16,2/ C 1 T(1,5,2,3,4) - DATA (CF(I, 3),I= 1, 6) /-8.888888888888888D-01 - $ ,1.111111111111111D-01,7.111111111111111D+00, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01/ + DATA (CF(I),I= 12, 15) /64,-16,20,2/ C 1 T(2,1,5,3,4) - DATA (CF(I, 4),I= 1, 6) /1.111111111111111D-01 - $ ,1.111111111111111D+00,-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01, - $ -8.888888888888888D-01/ + DATA (CF(I),I= 16, 18) /64,2,-16/ C 1 T(2,5,1,3,4) - DATA (CF(I, 5),I= 1, 6) /1.111111111111111D-01, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01,7.111111111111111D+00,-8.888888888888888D-01/ + DATA (CF(I),I= 19, 20) /64,-16/ C 1 T(5,1,2,3,4) - DATA (CF(I, 6),I= 1, 6) /1.111111111111111D+00 - $ ,1.111111111111111D-01,1.111111111111111D-01, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,7.111111111111111D+00/ + DATA (CF(I),I= 21, 21) /64/ C 1 T(5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
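
The matrix1.f change above replaces the dense REAL*8 CF(NCOLOR,NCOLOR) matrix with a packed integer upper triangle of length NCOLOR*(NCOLOR+1)/2: off-diagonal entries are pre-doubled, the common denominator is factored out into DENOM, and the inner loop walks J from I to NCOLOR with a single running CF_INDEX. The standalone C++ check below (not generated code; the jamp values are arbitrary test inputs) verifies that this packed triangular sum reproduces the full symmetric double sum for the gg->ttxg color matrix quoted in the DATA statements and in color_sum.cc.

```cpp
// Packed upper-triangular color matrix check (illustrative, not generated code).
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 6;
  const double denom = 9; // common denominator factored out as DENOM in matrix1.f
  const double cf[ncolor][ncolor] = { { 64, -8, -8, 1, 1, 10 },
                                      { -8, 64, 1, 10, -8, 1 },
                                      { -8, 1, 64, -8, 10, 1 },
                                      { 1, 10, -8, 64, 1, -8 },
                                      { 1, -8, 10, 1, 64, -8 },
                                      { 10, 1, 1, -8, -8, 64 } };
  // Packed upper triangle with doubled off-diagonals (matches DATA (CF(I),I=1,21) above)
  int cfPacked[ncolor * ( ncolor + 1 ) / 2];
  int idx = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = i; j < ncolor; j++ )
      cfPacked[idx++] = (int)( ( i == j ? 1 : 2 ) * cf[i][j] );
  // Arbitrary complex color amplitudes (test inputs only)
  const std::complex<double> jamp[ncolor] = { { 1, 2 }, { -3, 0.5 }, { 0, 1 }, { 2, -2 }, { 0.1, 0 }, { -1, -1 } };
  // Full symmetric double sum
  double full = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      full += ( std::conj( jamp[i] ) * cf[i][j] * jamp[j] ).real() / denom;
  // Triangular sum with the packed matrix (one running index, as CF_INDEX in matrix1.f)
  double tri = 0;
  idx = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += (double)cfPacked[idx++] * jamp[j];
    tri += ( ztemp * std::conj( jamp[i] ) ).real();
  }
  tri /= denom;
  assert( std::abs( full - tri ) <= 1e-9 * ( 1.0 + std::abs( full ) ) );
  return 0;
}
```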
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -549,10 +525,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -561,6 +539,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 4f8f49270b..c90527fa03 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
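
The kernel above now writes each helicity's jamps into the ghelAllJamps super-buffer through DeviceAccessJamp, and sigmaKin/color_sum_gpu pass "ghelAllJamps + ighel * nevt" to per-helicity kernels that then index it with ihel0 = 0. DeviceAccessJamp itself is not part of this patch, so the helper below is an illustrative reconstruction of the [2][ncolor][nGoodHel][nevt] striding spelled out in convertD2F_Jamps (real parts first, then imaginary parts); it checks that the per-helicity pointer offset is equivalent to indexing the full super-buffer at ihel = ighel.

```cpp
// Illustrative reconstruction of the jamp super-buffer striding (assumed layout, not plugin code).
#include <cassert>
#include <cstddef>

// Flat index of the ix2 part (0 = real, 1 = imag) of color icol, helicity ihel, event ievt,
// for a [2][ncolor][nhel][nevt] buffer laid out as in convertD2F_Jamps.
inline std::size_t jampIndex( int ix2, int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
{
  return ( ( static_cast<std::size_t>( ix2 ) * ncolor + icol ) * nhel + ihel ) * static_cast<std::size_t>( nevt ) + ievt;
}

int main()
{
  const int ncolor = 4, nhel = 5, nevt = 1024; // ncolor = 4 as in this gu->ttxu subprocess
  for( int ighel = 0; ighel < nhel; ighel++ )
    for( int icol = 0; icol < ncolor; icol++ )
      for( int ix2 = 0; ix2 < 2; ix2++ )
        // Offsetting the base pointer by ighel * nevt and using ihel0 = 0 inside the kernel
        // addresses the same element as indexing the full super-buffer at ihel = ighel.
        assert( jampIndex( ix2, icol, 0, 7, ncolor, nhel, nevt ) + static_cast<std::size_t>( ighel ) * nevt == jampIndex( ix2, icol, ighel, 7, ncolor, nhel, nevt ) );
  return 0;
}
```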
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -405,156 +461,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxu()?) 
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -610,7 +553,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +591,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +636,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -805,8 +760,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +769,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -977,13 +1110,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,18 +1122,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1031,93 +1163,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1228,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1251,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,21 +1260,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1219,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index b501a9772e..2c0025c7b9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index b0cc58e89c..340d51dbfa 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index 2b281a8200..83f5f0b209 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -486,51 +490,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
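In formula form (a restatement of the folded loop below, not additional functionality), the triangular implementation computes, per helicity,

\Delta|M|^2 \;=\; \sum_i \frac{cf_{ii}}{\mathrm{denom}_i}\,\big( \mathrm{Re}^2 J_i + \mathrm{Im}^2 J_i \big)
\;+\; \sum_i \sum_{j>i} \frac{2\,cf_{ij}}{\mathrm{denom}_i}\,\big( \mathrm{Re}\,J_i\,\mathrm{Re}\,J_j + \mathrm{Im}\,J_i\,\mathrm{Im}\,J_j \big)

which equals the full symmetric double sum because colorMatrix is symmetric and, for this process, all colorDenom entries are equal (they are all 1 here); the factors cf_ii/denom_i and 2*cf_ij/denom_i are exactly the entries precomputed at compile time in TriangularNormalizedColorMatrix (cf2).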
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt)
+                                              nevtN ) );                        // there are nevtN (nhel*nevt) "batches"
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Convert MEs from float to double
+    for( int ighel = 0; ighel < nhel; ighel++ )
+    {
+      fptype* hAllMEs = ghelAllMEs + ighel * nevt;          // MEs for a single helicity ihel
+      fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel
+      gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 );
+    }
+#endif
+  }
+#endif /* clang-format on */
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+    // CASE 1: KERNEL
+    if( !pBlasHandle )
+    {
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      // Loop over helicities
+      for( int ighel = 0; ighel < nGoodHel; ighel++ )
+      {
+        fptype* hAllMEs = ghelAllMEs + ighel * nevt;           // MEs for one specific helicity ighel
+        const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel
+        gpuStream_t hStream = ghelStreams[ighel];
+        gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel );
+      }
+    }
+    // CASE 2: BLAS
+    else
+    {
+#ifdef MGONGPU_HAS_NO_BLAS
+      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
+#else
+      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
+      // Reset the tmp buffer
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
+#else
+      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
+#endif
+      // Delegate the color sum to BLAS for all good helicities
+      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
+#endif
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+} // end namespace
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/configs.inc
index 225cf5aca4..0a6b8dbc07 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/configs.inc
+++
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/configs.inc @@ -57,3 +57,5 @@ C Diagram 5 DATA (SPROP(I,-3,5),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/5/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f index 9394a561b8..8aa675cd01 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -246,17 +243,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -326,7 +312,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -372,7 +358,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -415,31 +402,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(5,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(5,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,5,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,5,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
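Note on the packed color matrix introduced above: the DATA statements replace the full NCOLOR x NCOLOR real matrix CF with a single integer DENOM and an integer array holding only the upper triangle of each row, with the off-diagonal entries already multiplied by 2 (compare 12,8,8,0 against the old row 12,4,4,0); the CF_INDEX counter in the next hunk walks this packed array row by row. A minimal standalone C++ sketch of the same index arithmetic (the helper name cfIndex and its 1-based convention are illustrative assumptions, not part of the generated Fortran):

#include <cassert>

// Position (1-based) of entry (I,J), with J >= I, in a row-major packed upper triangle
// of an ncolor x ncolor symmetric matrix: rows 1..I-1 contribute ncolor, ncolor-1, ... entries.
int cfIndex( int I, int J, int ncolor )
{
  assert( 1 <= I && I <= J && J <= ncolor );
  const int skipped = ( I - 1 ) * ncolor - ( I - 1 ) * ( I - 2 ) / 2; // entries in rows 1..I-1
  return skipped + ( J - I + 1 );
}

For ncolor=4 this gives cfIndex(1,1)=1 ... cfIndex(1,4)=4, cfIndex(2,2)=5, cfIndex(3,3)=8 and cfIndex(4,4)=10, matching the DATA ranges above (10 = 4*5/2 entries in total); the common normalization is divided out once at the end via MATRIX1 = MATRIX1/DENOM.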
@@ -492,10 +476,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -504,6 +490,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index e2d65a2667..812f8dec18 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
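For reference, the jamp super-buffer that calculate_jamps writes into (and that the color_sum code earlier in this diff reads back, see its striding comments) is laid out as [2][ncolor][nGoodHel][nevt], with the real/imaginary split as the slowest index and the event index as the fastest. A small hedged sketch of that flat indexing (the helper name jampIndex and its argument order are illustrative, not the plugin's DeviceAccessJamp API):

#include <cstddef>

// Flat offset of jamp(part, icol, ihel, ievt) in a [2][ncolor][nGoodHel][nevt] buffer,
// i.e. part*ncolor*nGoodHel*nevt + icol*nGoodHel*nevt + ihel*nevt + ievt.
inline std::size_t jampIndex( int part,   // 0 = real part, 1 = imaginary part
                              int icol,   // color flow index in [0, ncolor)
                              int ihel,   // good-helicity index in [0, nGoodHel)
                              int ievt,   // event index in [0, nevt)
                              int ncolor, int nGoodHel, int nevt )
{
  return ( ( (std::size_t)part * ncolor + icol ) * nGoodHel + ihel ) * nevt + ievt;
}

Keeping ievt as the fastest index should give coalesced per-thread accesses in the kernels, while keeping the real/imaginary split outermost lets the BLAS path treat each part as a dense (ncolor) x (nGoodHel*nevt) matrix.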
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -405,156 +461,43 @@ namespace mg5amcCpu jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxux()?) 
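The atomicAdd introduced above is needed because, with one CUDA stream per good helicity, several calculate_jamps kernels may be in flight at the same time and all accumulate into the same colAllJamp2s slot for a given (icol, ievt); within one kernel each thread owns its own ievt, so the race is only across streams. A minimal hedged CUDA sketch of that accumulation pattern (kernel and buffer names are illustrative, not the plugin's API; double-precision atomicAdd needs compute capability 6.0 or newer):

#include <cuda_runtime.h>

// One such kernel is launched per good helicity, each in its own stream; they all
// add their |jamp(icol)|^2 contribution into the same shared [ncolor][nevt] buffer.
__global__ void accumulateJamp2( double* jamp2,          // shared running sum, [ncolor * nevt]
                                 const double* absJamp2, // this helicity's contribution, [ncolor * nevt]
                                 int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    atomicAdd( &jamp2[icol * nevt + ievt], absJamp2[icol * nevt + ievt] ); // safe across concurrent streams
}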
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
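The comment removed above ("Use the property that M is a real matrix...") relies on the identity that, for a real symmetric M and color amplitudes J = A + iB, the quadratic form J^dagger M J equals A^T M A + B^T M B, the imaginary cross terms cancelling because A^T M B = B^T M A. A tiny self-contained numerical check of that identity (arbitrary 2x2 example, not any generated color matrix):

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  const double M[2][2] = { { 12., 4. }, { 4., 12. } };            // a real symmetric matrix
  const std::complex<double> J[2] = { { 1., 2. }, { -3., 0.5 } }; // J = A + iB
  std::complex<double> lhs = 0.;
  double rhs = 0.;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
    {
      lhs += std::conj( J[i] ) * M[i][j] * J[j];                                        // J^dagger M J
      rhs += J[i].real() * M[i][j] * J[j].real() + J[i].imag() * M[i][j] * J[j].imag(); // A^T M A + B^T M B
    }
  assert( std::abs( lhs.imag() ) < 1e-12 );      // cross terms cancel for symmetric M
  assert( std::abs( lhs.real() - rhs ) < 1e-12 ); // real part equals AMA + BMB
  return 0;
}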
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -610,7 +553,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +591,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +636,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -805,8 +760,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +769,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -977,13 +1110,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,18 +1122,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1031,93 +1163,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1228,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1251,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,21 +1260,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1219,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index d658e0394e..7a811e35e9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index e36675626f..f9cde14dc2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 61bb13c3e7..136c6cded7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -486,51 +490,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
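The new color_sum.cc file introduced here factors the colour algebra out of the former calculate_wavefunctions: for one helicity it adds |M|^2 = sum_{i,j} Re( conj(jamp_i) * cf[i][j] * jamp_j ) / denom[i] to the running sum over helicities, and the kernel, SIMD and BLAS (GEMM-based) paths that follow are all meant to reproduce this same dense quadratic form. As a point of reference for those optimised paths, here is a minimal standalone sketch in plain C++, hardcoding the 4x4 matrix that appears just below; the names are illustrative and not the plugin's API.

  #include <array>
  #include <complex>
  #include <cstdio>

  // Reference (dense) colour sum for one event and one helicity:
  // deltaME = sum_{i,j} Re( conj( jamp[i] ) * cf[i][j] * jamp[j] ) / denom[i]
  // Matrix values are those of this P1_gux_ttxux subprocess (ncolor=4, all denominators 1).
  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = {
    { 12, 4, 4, 0 },
    { 4, 12, 0, 4 },
    { 4, 0, 12, 4 },
    { 0, 4, 4, 12 } };

  double colorSumDense( const std::array<std::complex<double>, ncolor>& jamp )
  {
    double deltaME = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      std::complex<double> ztemp = 0;
      for( int j = 0; j < ncolor; j++ ) ztemp += cf[i][j] * jamp[j];
      deltaME += ( std::conj( jamp[i] ) * ztemp ).real() / denom[i];
    }
    return deltaME;
  }

  int main()
  {
    const std::array<std::complex<double>, ncolor> jamp = { { { 0.1, -0.2 }, { 0.3, 0.05 }, { -0.4, 0.1 }, { 0.2, 0.3 } } };
    std::printf( "deltaME (dense) = %f\n", colorSumDense( jamp ) );
    return 0;
  }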
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
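The comment block above motivates the optimised CPU path: because the colour matrix is real, the quadratic form conj(J).M.J reduces to A.M.A + B.M.B with A = Re(J) and B = Im(J), and because it is symmetric only the upper triangle needs to be visited once the off-diagonal entries have been pre-doubled and pre-divided by the row denominator at compile time. A standalone sketch of that idea, cross-checked against the plain dense form (same illustrative 4x4 matrix as in the previous sketch; the names are not the plugin's):

  #include <array>
  #include <cassert>
  #include <cmath>
  #include <complex>

  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = {
    { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };

  // Upper-triangular, normalised copy of cf: diagonal = cf[i][i]/denom[i], off-diagonal = 2*cf[i][j]/denom[i]
  struct TriangularNormalized
  {
    constexpr TriangularNormalized()
      : value()
    {
      for( int i = 0; i < ncolor; i++ )
      {
        value[i][i] = cf[i][i] / denom[i];
        for( int j = i + 1; j < ncolor; j++ ) value[i][j] = 2 * cf[i][j] / denom[i];
      }
    }
    double value[ncolor][ncolor];
  };

  // Real matrix: conj(J).cf.J = A.cf.A + B.cf.B; symmetric matrix: only visit j >= i
  double colorSumTriangular( const std::array<std::complex<double>, ncolor>& jamp )
  {
    static constexpr auto cf2 = TriangularNormalized();
    double deltaME = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      double ztempR = cf2.value[i][i] * jamp[i].real();
      double ztempI = cf2.value[i][i] * jamp[i].imag();
      for( int j = i + 1; j < ncolor; j++ )
      {
        ztempR += cf2.value[i][j] * jamp[j].real();
        ztempI += cf2.value[i][j] * jamp[j].imag();
      }
      deltaME += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
    }
    return deltaME;
  }

  int main()
  {
    const std::array<std::complex<double>, ncolor> jamp = { { { 0.1, -0.2 }, { 0.3, 0.05 }, { -0.4, 0.1 }, { 0.2, 0.3 } } };
    double dense = 0; // straightforward dense quadratic form for comparison
    for( int i = 0; i < ncolor; i++ )
      for( int j = 0; j < ncolor; j++ )
        dense += ( std::conj( jamp[i] ) * cf[i][j] * jamp[j] ).real() / denom[i];
    assert( std::abs( colorSumTriangular( jamp ) - dense ) < 1e-12 );
    return 0;
  }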
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/configs.inc index 693e4354b0..28a94fd35a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/configs.inc @@ -57,3 +57,5 @@ C Diagram 5 DATA (SPROP(I,-3,5),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/5/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f index c7fdad381b..f77432fcd1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -246,17 +243,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -326,7 +312,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -372,7 +358,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -415,31 +402,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,5) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,5) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,5) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,5) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
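In the matrix1.f hunk above, the dense REAL*8 CF(NCOLOR,NCOLOR) colour matrix becomes a packed INTEGER array holding only the upper triangle row by row, with off-diagonal entries pre-doubled (8 instead of 4) and a single integer DENOM applied once at the end; the matching loop with the running CF_INDEX counter appears in the next hunk. A plain C++ sketch (hypothetical names) of the same packing and traversal, cross-checked against the dense matrix:

  #include <cassert>
  #include <cmath>
  #include <complex>

  // Packed upper triangle of the 4x4 colour matrix, row by row, with off-diagonal entries
  // already doubled: {12,8,8,0 | 12,0,8 | 12,8 | 12}; the common denominator here is 1.
  constexpr int ncolor = 4;
  constexpr int denomPacked = 1;
  constexpr int cfPacked[ncolor * ( ncolor + 1 ) / 2] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 };
  constexpr double cfDense[ncolor][ncolor] = {
    { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };

  int main()
  {
    const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.05 }, { -0.4, 0.1 }, { 0.2, 0.3 } };
    // Packed traversal with a running index, mirroring the Fortran CF_INDEX loop
    double packedSum = 0;
    int cfIndex = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      std::complex<double> ztemp = 0;
      for( int j = i; j < ncolor; j++ ) ztemp += (double)cfPacked[cfIndex++] * jamp[j];
      packedSum += ( ztemp * std::conj( jamp[i] ) ).real();
    }
    packedSum /= denomPacked;
    // Dense reference (denominators are all 1 for this subprocess)
    double denseSum = 0;
    for( int i = 0; i < ncolor; i++ )
      for( int j = 0; j < ncolor; j++ )
        denseSum += cfDense[i][j] * ( std::conj( jamp[i] ) * jamp[j] ).real();
    assert( std::abs( packedSum - denseSum ) < 1e-12 );
    return 0;
  }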
@@ -492,10 +476,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -504,6 +490,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 4f41927bc9..e7e58d3385 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
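calculate_jamps now writes the colour amplitudes of each good helicity into a global 'super-buffer' shared by all helicities. Following the striding comments in color_sum_blas earlier in this diff, an element is addressed as allJamps[ix2*ncolor*nhel*nevt + icol*nhel*nevt + ihel*nevt + ievt], where ix2 = 0/1 selects the real or imaginary part. Below is a host-side sketch of an indexing helper for that layout; the plugin itself goes through the DeviceAccessJamp class, so the helper name and the checks are only illustrative.

  #include <cassert>
  #include <cstddef>
  #include <vector>

  // Flat index into the jamp super-buffer laid out as [ix2][icol][ihel][ievt]:
  // all real parts first (ix2=0), then all imaginary parts (ix2=1); within each part,
  // events are contiguous for a given ( icol, ihel ) pair.
  inline std::size_t jampIndex( int ix2, int icol, int ihel, int ievt, int ncolor, int nhel, int nevt )
  {
    assert( ix2 >= 0 && ix2 < 2 );
    assert( icol >= 0 && icol < ncolor );
    assert( ihel >= 0 && ihel < nhel );
    assert( ievt >= 0 && ievt < nevt );
    return ( ( (std::size_t)ix2 * ncolor + icol ) * nhel + ihel ) * nevt + ievt;
  }

  int main()
  {
    const int ncolor = 4, nhel = 3, nevt = 8;
    std::vector<double> jamps( 2 * ncolor * nhel * nevt, 0. );
    // Write the real and imaginary parts of jamp( icol=2 ) for helicity 1, event 5
    jamps[jampIndex( 0, 2, 1, 5, ncolor, nhel, nevt )] = 0.25;  // real part
    jamps[jampIndex( 1, 2, 1, 5, ncolor, nhel, nevt )] = -0.75; // imaginary part
    // Events are the fastest index: for each colour, the ( ihel, ievt ) pairs form one
    // contiguous column of length nhel*nevt, which the GEMM calls in color_sum_blas rely on
    assert( jampIndex( 0, 2, 1, 6, ncolor, nhel, nevt ) == jampIndex( 0, 2, 1, 5, ncolor, nhel, nevt ) + 1 );
    return 0;
  }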
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -405,156 +461,43 @@ namespace mg5amcCpu jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_uux_ttxg()?) 
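The '*** COLOR CHOICE BELOW ***' lines above keep a running sum of |jamp(icol)|^2 over the good helicities: directly into jamp2_sv on the C++/SIMD path, and through an atomicAdd into the colAllJamp2s super-buffer on the GPU path shown just below, which is needed now that each helicity runs in its own stream. These per-colour sums feed the event-by-event colour selection, in the same spirit as the helicity selection via the MEs_ighel running sums seen earlier in this diff. A simplified plain C++ sketch of the idea (hypothetical names; the plugin's actual selection logic in sigmaKin is more involved):

  #include <cassert>
  #include <complex>
  #include <vector>

  // Accumulate |jamp(icol)|^2 over helicities, then pick a colour from the cumulative
  // distribution using one uniform random number in [0,1) (standing in for allrndcol).
  int chooseColor( const std::vector<std::vector<std::complex<double>>>& jampPerHel, // [ihel][icol]
                   double rndcol )
  {
    const int ncolor = (int)jampPerHel[0].size();
    std::vector<double> jamp2( ncolor, 0. );
    for( const auto& jamp : jampPerHel )
      for( int icol = 0; icol < ncolor; icol++ )
        jamp2[icol] += std::norm( jamp[icol] ); // |jamp|^2, summed over helicities
    double total = 0;
    for( int icol = 0; icol < ncolor; icol++ ) total += jamp2[icol];
    double running = 0;
    for( int icol = 0; icol < ncolor; icol++ )
    {
      running += jamp2[icol];
      if( rndcol < running / total ) return icol; // 0-based here (Fortran expects 1-based)
    }
    return ncolor - 1; // guard against rounding when rndcol is very close to 1
  }

  int main()
  {
    const std::vector<std::vector<std::complex<double>>> jampPerHel = {
      { { 0.1, -0.2 }, { 0.3, 0.05 }, { -0.4, 0.1 }, { 0.2, 0.3 } },
      { { 0.0, 0.1 }, { -0.2, 0.2 }, { 0.3, -0.1 }, { 0.1, 0.0 } } };
    const int icolSel = chooseColor( jampPerHel, 0.7 );
    assert( icolSel >= 0 && icolSel < 4 );
    return 0;
  }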
- - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
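The lines removed below, together with their replacement inside color_sum_cpu earlier in this diff, implement the 'mixed' floating-point mode: the colour algebra runs in single precision on a merged vector that packs two double-precision event pages (fpvmerge), and the two halves of the result are split back out (fpvsplit0 / fpvsplit1) into the double-precision running sums. A scalar-loop sketch of that merge/compute/split pattern, assuming neppV = 4 and a simple concatenation of the two pages (the lane ordering of the actual SIMD helpers in mgOnGpuVectors.h may differ):

  #include <array>
  #include <cstdio>

  constexpr int neppV = 4; // events per double-precision SIMD page (an assumption for this sketch)

  int main()
  {
    // Two double-precision event pages (e.g. the real parts of one colour amplitude)
    const std::array<double, neppV> pageA = { 0.10, 0.20, 0.30, 0.40 };
    const std::array<double, neppV> pageB = { 0.50, 0.60, 0.70, 0.80 };

    // "fpvmerge": pack both pages into one single-precision vector of 2*neppV events
    std::array<float, 2 * neppV> merged;
    for( int i = 0; i < neppV; i++ ) merged[i] = (float)pageA[i];
    for( int i = 0; i < neppV; i++ ) merged[neppV + i] = (float)pageB[i];

    // Run the cheap, precision-tolerant part in single precision
    // (here squaring each entry as a stand-in for the colour quadratic form)
    std::array<float, 2 * neppV> result;
    for( int i = 0; i < 2 * neppV; i++ ) result[i] = merged[i] * merged[i];

    // "fpvsplit0" / "fpvsplit1": unpack the two halves and accumulate in double precision
    double sumA = 0, sumB = 0;
    for( int i = 0; i < neppV; i++ ) sumA += (double)result[i];
    for( int i = 0; i < neppV; i++ ) sumB += (double)result[neppV + i];
    std::printf( "page A contribution = %f, page B contribution = %f\n", sumA, sumB );
    return 0;
  }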
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -610,7 +553,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +591,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +636,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -805,8 +760,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +769,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -977,13 +1110,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,18 +1122,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1031,93 +1163,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1228,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1251,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,21 +1260,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1219,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index ebf14aca9e..013d386f6c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f index d46dad4fcb..f43ba8ff39 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f index d8e94d91bb..76b1a9dd93 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -497,51 +501,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc new file mode 100644 index 0000000000..42eca2f7c9 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc @@ -0,0 +1,429 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for all nGoodHel helicities + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/configs.inc index 897255fa04..907b407e8e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/configs.inc @@ -51,3 +51,5 @@ C Diagram 5 DATA TPRID(-2,5)/0/ C Number of configs DATA MAPCONFIG(0)/5/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f index c2eadb2c31..aa93a3d195 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f index 787dae76b2..7dc0b8e911 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -246,17 +243,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -326,7 +312,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -372,7 +358,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -415,31 +402,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(2,1) T(5,3,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(2,4) T(5,3,1) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(3,1) T(5,2,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(3,4) T(5,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
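The two matrix1.f hunks around this point replace the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix with an INTEGER array that stores only the upper triangle (row by row, starting at the diagonal, with off-diagonal entries pre-doubled) plus a single DENOM; the CF_INDEX loop in the next hunk walks that packed array. The standalone C++ sketch below (hypothetical function and variable names, not part of the generated code) illustrates why a single triangular pass followed by one division reproduces the full symmetric color sum.

#include <complex>
#include <vector>

// Hypothetical illustration of the packed upper-triangular color sum (not generated code):
// cf[] stores row i from the diagonal onwards with doubled off-diagonal entries, so taking
// the real part of ztemp * conj(jamp[i]) picks up both the (i,j) and (j,i) contributions.
double colorSumPacked( const std::vector<int>& cf,                    // size ncolor*(ncolor+1)/2
                       const std::vector<std::complex<double>>& jamp, // partial amplitudes, size ncolor
                       const int denom )
{
  const int ncolor = (int)jamp.size();
  double me2 = 0;
  int cfIndex = 0; // same role as CF_INDEX in the Fortran loop of the next hunk
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cf[cfIndex++] ) * jamp[j]; // row i, columns j >= i
    me2 += std::real( ztemp * std::conj( jamp[i] ) );
  }
  return me2 / denom;
}
// Example with the 4-color DATA statements above: cf = {12,8,8,0, 12,0,8, 12,8, 12}, denom = 1.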
@@ -492,10 +476,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -504,6 +490,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index da962495fd..1721f42b1f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 24; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
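The device buffers introduced above are flat arrays addressed through hand-written strides: DeviceAccessJamp2 maps (icol, ievt) to icol * nevt + ievt, and the per-helicity jamp super-buffer is laid out as [real/imag][icol][ighel][ievt], as spelled out in the cuBLAS striding comment inside color_sum_blas. The host-side helpers below (hypothetical names jamp2Index and jampIndex, for illustration only; the plugin itself goes through its memory-access classes) make that arithmetic explicit.

// Hypothetical index helpers mirroring the strides documented in this diff (illustration only).
inline int jamp2Index( const int icol, const int ievt, const int nevt )
{
  // same arithmetic as DeviceAccessJamp2::kernelAccessIcol, with ievt derived from the thread id on device
  return icol * nevt + ievt;
}

inline int jampIndex( const int reim,  // 0 = real part, 1 = imaginary part
                      const int icol,  // color flow index
                      const int ighel, // good-helicity index
                      const int ievt,  // event index
                      const int ncolor, const int nGoodHel, const int nevt )
{
  // allJamps super-buffer layout [2][ncolor][nGoodHel][nevt], matching the cuBLAS striding comment
  return reim * ncolor * nGoodHel * nevt + icol * nGoodHel * nevt + ighel * nevt + ievt;
}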
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -2461,176 +2517,43 @@ namespace mg5amcCpu jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gg_ttxgg()?) 
- - // The color denominators (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] - - // The color matrix (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, - { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, - { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, - { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, - { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, - { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, - { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, - { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, - { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, - { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, - { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, - { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, - { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, - { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, - { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, - { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, - { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, - { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, - { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, - { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, - { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, - { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, - { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, - { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - 
value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -2718,7 +2641,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } 
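The in-kernel color algebra removed in the hunks above, and the BLAS path that replaces it, both rest on the color matrix being real and symmetric: the quadratic form (A - iB) M (A + iB) over the jamps reduces to A M A + B M B, so the real and imaginary jamp components can be contracted with the normalized color matrix independently. A self-contained sketch of that identity is given below (hypothetical names, illustration only, double precision for simplicity).

#include <complex>
#include <vector>

using cxd = std::complex<double>;

// Hypothetical check of the identity used by both the removed in-kernel color sum and the
// BLAS color sum: for a real color matrix m, Re( J^dagger m J ) equals the sum of the two
// real quadratic forms over the real and imaginary parts of the jamps.
double me2Complex( const std::vector<std::vector<double>>& m, const std::vector<cxd>& jamp )
{
  double me2 = 0;
  for( size_t i = 0; i < jamp.size(); i++ )
    for( size_t j = 0; j < jamp.size(); j++ )
      me2 += std::real( std::conj( jamp[i] ) * m[i][j] * jamp[j] );
  return me2;
}

double me2RealImag( const std::vector<std::vector<double>>& m, const std::vector<cxd>& jamp )
{
  double me2 = 0;
  for( size_t i = 0; i < jamp.size(); i++ )
    for( size_t j = 0; j < jamp.size(); j++ )
      me2 += m[i][j] * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() ); // AMA + BMB
  return me2;
}
// The two functions agree term by term for any real m, which is why the real and imaginary
// jamp buffers can be processed in separate real GEMMs, as Steps 1 and 2 of color_sum_blas do.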
//-------------------------------------------------------------------------- @@ -2753,6 +2680,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -2795,6 +2726,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -2915,8 +2850,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -2924,25 +2859,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr 
to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -3087,13 +3200,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -3105,18 +3212,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -3141,93 +3253,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3269,7 +3318,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -3292,7 +3341,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -3301,21 +3350,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -3329,8 +3380,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -3346,11 +3399,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -3452,14 +3506,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index b6e3ba16d4..65b3e1d2ac 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 123; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 24; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f index 850bc73f22..23a723f0df 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f index 7af9753fb7..1e6337aaac 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 
0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc new file mode 100644 index 0000000000..91a7f9998e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc @@ -0,0 +1,449 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] + + // The color matrix (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, + { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, + { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, + { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, + { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, + { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, + { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, + { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, + { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, + { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, + { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, + { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, + { 8, -1, -64, 8, -10, 
-1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, + { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, + { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, + { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, + { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, + { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, + { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, + { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, + { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, + { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, + { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, + { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ 
use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer 
allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffer for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same<fptype, fptype2>::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; //
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for all good helicities + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/configs.inc index b50d3d5335..570419b5c0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/configs.inc @@ -1530,3 +1530,5 @@ C Diagram 105 DATA (SPROP(I,-4,105),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/105/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f index 39ecff768a..ec8440191f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -275,17 +272,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -355,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -398,7 +384,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(155) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -441,407 +428,81 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 1),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 1),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 1),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ + DATA DENOM/54/ + DATA (CF(I),I= 1, 24) /512,-128,-128,16,16,160,-128,16,16,-2,-2 + $ ,-20,16,-2,160,-20,142,124,-2,-20,-20,124,124,-56/ C 1 T(1,2,5,6,3,4) - DATA (CF(I, 2),I= 1, 6) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 2),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 2),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 2),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ + DATA (CF(I),I= 25, 47) /512,16,160,-128,16,16,-128,-2,-20,16,-2, + $ -2,-20,-20,124,124,-56,16,-2,160,-20,142,124/ C 1 T(1,2,6,5,3,4) - DATA (CF(I, 3),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ 
-1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 3),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 3),I= 13, 18) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 3),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I= 48, 69) /512,-128,160,16,16,-2,160,-20,142,124, + $ -128,16,16,-2,-2,-20,-20,-2,124,-56,-20,124/ C 1 T(1,5,2,6,3,4) - DATA (CF(I, 4),I= 1, 6) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 4),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 4),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 4),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ + DATA (CF(I),I= 70, 90) /512,16,-128,-2,-20,-20,124,124,-56,16, + $ -128,-2,-20,16,-2,-2,16,142,124,160,-20/ C 1 T(1,5,6,2,3,4) - DATA (CF(I, 5),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 5),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 5),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 5),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I= 91,110) /512,-128,-2,16,142,124,160,-20,-20,-2 + $ ,124,-56,-20,124,-128,16,16,-2,-2,-20/ C 1 T(1,6,2,5,3,4) - DATA (CF(I, 6),I= 1, 6) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 6),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 6),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 6),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=111,129) /512,-20,-2,124,-56,-20,124,-2,16,142,124 + $ ,160,-20,16,-128,-2,-20,16,-2/ C 1 T(1,6,5,2,3,4) - DATA (CF(I, 7),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 7),I= 7, 12) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ 
,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 7),I= 13, 18) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 7),I= 19, 24) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I=130,147) /512,-128,-128,16,16,160,160,-20,16,-2 + $ ,124,142,-20,124,-2,-20,-56,124/ C 1 T(2,1,5,6,3,4) - DATA (CF(I, 8),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 8),I= 7, 12) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 8),I= 13, 18) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 8),I= 19, 24) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ + DATA (CF(I),I=148,164) /512,16,160,-128,16,-20,124,-2,-20,-56 + $ ,124,160,-20,16,-2,124,142/ C 1 T(2,1,6,5,3,4) - DATA (CF(I, 9),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 9),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 9),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 9),I= 19, 24) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ + DATA (CF(I),I=165,180) /512,-128,160,16,16,-2,-128,16,-20,-2,124 + $ ,-56,-20,-2,124,-20/ C 1 T(2,5,1,6,3,4) - DATA (CF(I, 10),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 10),I= 7, 12) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 10),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 10),I= 19, 24) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ + DATA (CF(I),I=181,195) /512,16,-128,-2,-20,16,-128,-2,16,142,124 + $ ,-2,16,-20,160/ C 1 T(2,5,6,1,3,4) - DATA (CF(I, 11),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 11),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 11),I= 13, 18) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ 
-1.851851851851852D-01/ - DATA (CF(I, 11),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=196,209) /512,-128,124,-56,-20,-2,124,-20,16,-2, + $ -128,16,-20,-2/ C 1 T(2,6,1,5,3,4) - DATA (CF(I, 12),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 12),I= 7, 12) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 12),I= 13, 18) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 12),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=210,222) /512,142,124,-2,16,-20,160,-2,-20,16,-128 + $ ,-2,16/ C 1 T(2,6,5,1,3,4) - DATA (CF(I, 13),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 13),I= 7, 12) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 13),I= 13, 18) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 13),I= 19, 24) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I=223,234) /512,-128,-128,16,16,160,124,-20,-56,124, + $ -2,-20/ C 1 T(5,1,2,6,3,4) - DATA (CF(I, 14),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 14),I= 7, 12) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 14),I= 13, 18) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 14),I= 19, 24) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ + DATA (CF(I),I=235,245) /512,16,160,-128,16,-20,160,124,142,16,-2/ C 1 T(5,1,6,2,3,4) - DATA (CF(I, 15),I= 1, 6) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 15),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 15),I= 13, 18) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 15),I= 19, 24) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=246,255) /512,-128,160,16,-56,124,124,-20,-20,-2/ C 1 T(5,2,1,6,3,4) - 
DATA (CF(I, 16),I= 1, 6) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 16),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 16),I= 13, 18) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 16),I= 19, 24) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=256,264) /512,16,-128,124,142,-20,160,-2,16/ C 1 T(5,2,6,1,3,4) - DATA (CF(I, 17),I= 1, 6) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 17),I= 7, 12) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 17),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 17),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=265,272) /512,-128,-2,16,-20,-2,-128,16/ C 1 T(5,6,1,2,3,4) - DATA (CF(I, 18),I= 1, 6) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 18),I= 7, 12) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 18),I= 13, 18) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 18),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=273,279) /512,-20,-2,-2,16,16,-128/ C 1 T(5,6,2,1,3,4) - DATA (CF(I, 19),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 19),I= 7, 12) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 19),I= 13, 18) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 19),I= 19, 24) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ + DATA (CF(I),I=280,285) /512,-128,-128,16,16,160/ C 1 T(6,1,2,5,3,4) - DATA (CF(I, 20),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 20),I= 7, 12) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 20),I= 13, 
18) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 20),I= 19, 24) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ + DATA (CF(I),I=286,290) /512,16,160,-128,16/ C 1 T(6,1,5,2,3,4) - DATA (CF(I, 21),I= 1, 6) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 21),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 21),I= 13, 18) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 21),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ + DATA (CF(I),I=291,294) /512,-128,160,16/ C 1 T(6,2,1,5,3,4) - DATA (CF(I, 22),I= 1, 6) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 22),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 22),I= 13, 18) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 22),I= 19, 24) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=295,297) /512,16,-128/ C 1 T(6,2,5,1,3,4) - DATA (CF(I, 23),I= 1, 6) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 7, 12) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 23),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ + DATA (CF(I),I=298,299) /512,-128/ C 1 T(6,5,1,2,3,4) - DATA (CF(I, 24),I= 1, 6) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 24),I= 7, 12) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 24),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 24),I= 19, 24) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ + DATA (CF(I),I=300,300) /512/ C 1 T(6,5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN 
FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -1547,10 +1208,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -1559,6 +1222,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(2)=AMP2(2)+AMP(4)*DCONJG(AMP(4)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 70d0f7cb8e..c8b71f5ba4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
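The DeviceAccessJamp2 accessor introduced above uses a structure-of-arrays layout, buffer[icol * nevt + ievt], so that for a fixed colour index consecutive GPU threads (consecutive ievt) touch contiguous addresses. A minimal standalone sketch of that access pattern follows; the kernel name fillJamp2SoA and the plain double type are illustrative only and are not part of the generated sources.

#include <cuda_runtime.h>

// Illustrative only: zero-initialise a jamp2-like buffer laid out as [icol][ievt].
__global__ void fillJamp2SoA( double* buffer, int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;                 // total #events == total #threads
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;  // event handled by this thread
  for( int icol = 0; icol < ncolor; icol++ )
    buffer[icol * nevt + ievt] = 0.; // stride-1 in ievt for fixed icol: coalesced accesses
}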
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -868,164 +924,43 @@ namespace mg5amcCpu jamp_sv[9] += 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gg_ttxuux()?) 
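The jamp2 running sums accumulated just above (cxabs2 of each colour flow, summed over helicities) are used to drive the event-by-event random choice of colour; the GPU version of that choice is the select_col kernel further below. A hedged host-side sketch of the selection logic, stripped of the icolamp/channelId filtering that select_col applies, is given here; the name chooseColor and the use of std::vector are illustrative only.

#include <vector>

// Illustrative only: pick a colour in [1,ncolor] with probability proportional to jamp2[icol],
// by comparing one random number in [0,1) against the normalised cumulative sums.
int chooseColor( const std::vector<double>& jamp2, double rnd )
{
  const int ncolor = (int)jamp2.size();
  std::vector<double> target( ncolor, 0. );
  for( int icol = 0; icol < ncolor; icol++ )
    target[icol] = ( icol == 0 ? 0. : target[icol - 1] ) + jamp2[icol]; // cumulative sum
  for( int icol = 0; icol < ncolor; icol++ )
    if( rnd < target[icol] / target[ncolor - 1] ) return icol + 1; // Fortran-style [1,ncolor]
  return ncolor; // numerical safety net (should not be reached for rnd < 1)
}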
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, - { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, - { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, - { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, - { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, - { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, - { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
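The removed comments above use two properties of the colour matrix: it is symmetric, so only the upper triangle is needed with off-diagonal terms counted twice, and it is real, so the quadratic form conj(J) C J reduces to Re(J) C Re(J) + Im(J) C Im(J). The Fortran hunk earlier in this patch exploits the same symmetry by flattening CF into an upper-triangular array walked with DO J = I, NCOLOR and a running CF_INDEX. A scalar reference sketch, purely illustrative (colorSumScalar is not a function of the plugin), assuming a common denominator as in this process:

#include <complex>
#include <vector>

// Illustrative only: ME = sum_ij conj(J_i) * C_ij/d * J_j for a real symmetric C
// and a common denominator d, visiting only the upper triangle of C.
double colorSumScalar( const std::vector<std::complex<double>>& jamp,
                       const std::vector<std::vector<double>>& cf,
                       double denom )
{
  const int ncolor = (int)jamp.size();
  double me = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    // Diagonal term
    double ztempR = cf[icol][icol] * jamp[icol].real();
    double ztempI = cf[icol][icol] * jamp[icol].imag();
    // Off-diagonal terms: count each (icol,jcol) pair twice (C is symmetric)
    for( int jcol = icol + 1; jcol < ncolor; jcol++ )
    {
      ztempR += 2 * cf[icol][jcol] * jamp[jcol].real();
      ztempI += 2 * cf[icol][jcol] * jamp[jcol].imag();
    }
    me += ( jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI ) / denom;
  }
  return me;
}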
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -1113,7 +1048,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +1087,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +1133,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1310,8 +1257,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1319,25 +1266,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of 
helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1482,13 +1607,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,18 +1619,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1536,93 +1660,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1725,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1748,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,21 +1757,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1724,8 +1787,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1806,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1913,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index 84a8066974..d142f229d3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f index 49cac7230f..daa43b594b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f index 6e1c3f774f..747f5861c7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -305,6 +305,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -388,12 +392,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -475,51 +479,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 
0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc new file mode 100644 index 0000000000..767405ac3b --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc @@ -0,0 +1,437 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, + { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, + { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, + { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, + { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, + { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, + { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void 
createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
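[Editor's note] The comment above relies on two algebraic facts: for a real color matrix M the quadratic form Re(J^dagger M J) reduces to A.M.A + B.M.B with A = Re(J), B = Im(J), and for a symmetric M with equal denominators (all 3 for this process) the sum can be folded onto the upper triangle with the off-diagonal entries doubled. The standalone sketch below is not part of the patch; the 3x3 matrix, equal denominators and plain scalar types are illustrative stand-ins for the plugin's ncolor=12 SIMD code. It only checks that the two forms agree.

// Standalone illustrative sketch (assumed toy values, not generated code)
#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>
int main()
{
  constexpr int ncolor = 3;                          // toy size (the real process uses 12)
  const double denom[ncolor] = { 3, 3, 3 };          // equal color denominators (required for the folding)
  const double cf[ncolor][ncolor] = { { 48, 16, 0 }, // real symmetric color matrix
                                      { 16, 48, 16 },
                                      { 0, 16, 48 } };
  const std::complex<double> jamp[ncolor] = { { 1.1, -0.3 }, { 0.2, 0.7 }, { -0.5, 0.4 } };
  // Reference: full square loop over the complex quadratic form
  double meFull = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < ncolor; j++ ) ztemp += cf[i][j] * jamp[j];
    meFull += ( std::conj( ztemp ) * jamp[i] ).real() / denom[i];
  }
  // Triangular variant: diagonal once, off-diagonal terms counted twice (M is symmetric)
  double meTri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    const double ai = jamp[i].real(), bi = jamp[i].imag();
    double ztempR = cf[i][i] / denom[i] * ai;
    double ztempI = cf[i][i] / denom[i] * bi;
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    meTri += ai * ztempR + bi * ztempI; // only the real parts survive, as in the comment above
  }
  printf( "full=%f triangular=%f\n", meFull, meTri );
  assert( std::abs( meFull - meTri ) < 1e-12 );
  return 0;
}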
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/configs.inc index d6f8bae63a..0fcb4cf404 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/configs.inc @@ -510,3 +510,5 @@ C Diagram 35 DATA (SPROP(I,-4,35),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/35/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f index 9fb8f4d180..80e3731885 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,111 +434,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,0.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 1),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,32,12,0,32,-4,0,-12,-4,-4,12/ C 1 T(1,2,3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,0.000000000000000D+00/ - DATA (CF(I, 2),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 13, 23) /48,12,32,32,0,0,-4,-4,-12,12,-4/ C 1 T(1,2,3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 24, 33) /48,32,-4,0,0,32,-4,12,-12,-4/ C 1 T(1,2,5,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /2.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 4),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 34, 42) /48,0,-4,32,0,12,-4,-4,-12/ C 1 T(1,2,5,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ 
,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 43, 50) /48,32,32,12,0,-4,32,0/ C 1 T(1,3,4) T(2,5,6) - DATA (CF(I, 6),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01/ - DATA (CF(I, 6),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 51, 57) /48,12,32,-4,0,0,32/ C 1 T(1,3,6) T(2,5,4) - DATA (CF(I, 7),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 58, 63) /48,32,32,0,0,-4/ C 1 T(1,5,4) T(2,3,6) - DATA (CF(I, 8),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 64, 68) /48,0,32,-4,0/ C 1 T(1,5,6) T(2,3,4) - DATA (CF(I, 9),I= 1, 6) /-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(2,1,3,4) T(5,6) - DATA (CF(I, 10),I= 1, 6) /-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,0.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(2,1,3,6) T(5,4) - DATA (CF(I, 11),I= 1, 6) /-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ - DATA (CF(I, 11),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(2,1,5,4) T(3,6) - DATA (CF(I, 12),I= 1, 6) /2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ - DATA (CF(I, 12),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(2,1,5,6) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
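[Editor's note] The rewritten MATRIX1 replaces the dense REAL*8 CF(NCOLOR,NCOLOR) above with a packed integer array holding only the upper triangle, pre-scaled by the common DENOM and with off-diagonal entries already doubled (e.g. 5.333... = 16/3 is stored as 32 = 2*16 with DENOM = 3), so the color loop further down in matrix1.f can run over J >= I and divide by DENOM once at the end. The following standalone C++ sketch of the same packing uses a made-up 4x4 symmetric matrix and illustrative names, not the generated code, and only checks that the packed and dense sums agree.

// Standalone illustrative sketch of the packed upper-triangular color matrix
#include <cassert>
#include <cmath>
#include <complex>
#include <vector>
int main()
{
  constexpr int n = 4;
  const int denom = 3;
  const int cf[n][n] = { { 48, 16, 0, 16 }, // symmetric integer matrix (= denom * real color factors)
                         { 16, 48, 16, 0 },
                         { 0, 16, 48, 16 },
                         { 16, 0, 16, 48 } };
  // Pack the upper triangle row by row; off-diagonal entries are stored doubled
  std::vector<int> cfPacked;
  for( int i = 0; i < n; i++ )
    for( int j = i; j < n; j++ )
      cfPacked.push_back( j == i ? cf[i][j] : 2 * cf[i][j] );
  assert( (int)cfPacked.size() == n * ( n + 1 ) / 2 );
  const std::complex<double> jamp[n] = { { 1., 2. }, { -1., 0.5 }, { 0.3, -0.2 }, { 2., 1. } };
  // Reference: dense loop, dividing each row by denom
  double meFull = 0;
  for( int i = 0; i < n; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < n; j++ ) ztemp += (double)cf[i][j] * jamp[j];
    meFull += ( std::conj( ztemp ) * jamp[i] ).real() / denom;
  }
  // Packed variant mirroring the new Fortran loop: a running CF_INDEX over j >= i,
  // with a single division by DENOM after the double loop
  double mePacked = 0;
  int cfIndex = 0; // CF_INDEX
  for( int i = 0; i < n; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < n; j++ ) ztemp += (double)cfPacked[cfIndex++] * jamp[j];
    mePacked += ( std::conj( ztemp ) * jamp[i] ).real();
  }
  mePacked /= denom;
  assert( std::abs( meFull - mePacked ) < 1e-9 );
  return 0;
}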
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -760,10 +680,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -772,6 +694,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index ac4bf091b7..a7827dbfab 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
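[Editor's note] For reference, the jamp super-buffer that calculate_jamps writes and the BLAS color sum reads is, per the striding comments above, one flat fptype array of size 2*ncolor*nGoodHel*nevt, with all real parts first, all imaginary parts second, and the index icol*nhel*nevt + ihel*nevt + ievt inside each half. The host-side sketch below uses a plain std::vector and made-up names; it is not the plugin's DeviceAccessJamp accessor, only an illustration of that indexing.

// Standalone illustrative sketch of the assumed jamp super-buffer striding
#include <cassert>
#include <complex>
#include <vector>
int main()
{
  const int ncolor = 12, nhel = 16, nevt = 32;
  std::vector<double> jamps( 2 * ncolor * nhel * nevt, 0. );
  double* jampsReal = jamps.data();                        // first half: real parts
  double* jampsImag = jamps.data() + ncolor * nhel * nevt; // second half: imaginary parts
  // Write one complex amplitude for a given (icol, ihel, ievt)
  auto write = [&]( int icol, int ihel, int ievt, std::complex<double> z )
  {
    const int idx = icol * nhel * nevt + ihel * nevt + ievt;
    jampsReal[idx] = z.real();
    jampsImag[idx] = z.imag();
  };
  write( 3, 5, 7, { 1.5, -2.5 } );
  // The same element seen as a flat [2][ncolor][nhel][nevt] array
  assert( jamps[0 * ncolor * nhel * nevt + 3 * nhel * nevt + 5 * nevt + 7] == 1.5 );
  assert( jamps[1 * ncolor * nhel * nevt + 3 * nhel * nevt + 5 * nevt + 7] == -2.5 );
  // A per-helicity sub-buffer (as passed to one stream) just offsets by ihel*nevt,
  // so the kernel can use ihel0 = 0 internally
  const double* hJampsReal = jampsReal + 5 * nevt;
  assert( hJampsReal[3 * nhel * nevt + 0 * nevt + 7] == 1.5 );
  return 0;
}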
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -868,164 +924,43 @@ namespace mg5amcCpu jamp_sv[10] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gu_ttxgu()?) 
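[Editor's note] The per-color running sums accumulated here (jamp2_sv on C++, colAllJamp2s via atomicAdd on GPU) feed the event-by-event color choice implemented further down in select_col: a cumulative sum is built over the colors allowed for the selected config and the first bin whose cumulative fraction exceeds the random number is picked. The standalone sketch below reproduces only that selection logic with made-up sizes and values; it is not the plugin's code.

// Standalone illustrative sketch of the jamp2-driven color selection
#include <cassert>
#include <complex>
int main()
{
  constexpr int ncolor = 4;
  const std::complex<double> jampHel0[ncolor] = { { 1., 0. }, { 0., 2. }, { 0.5, 0.5 }, { 0., 0. } };
  const std::complex<double> jampHel1[ncolor] = { { 0., 1. }, { 1., 1. }, { 0., 0. }, { 2., 0. } };
  const bool icolamp[ncolor] = { true, true, false, true }; // colors allowed for this config (made up)
  // Running sum of |jamp|^2 over helicities (what jamp2 / colAllJamp2s accumulate)
  double jamp2[ncolor] = { 0 };
  for( int icol = 0; icol < ncolor; icol++ )
    jamp2[icol] += std::norm( jampHel0[icol] ) + std::norm( jampHel1[icol] );
  // Cumulative target amplitudes over the allowed colors only
  double targetamp[ncolor];
  for( int icol = 0; icol < ncolor; icol++ )
  {
    targetamp[icol] = ( icol == 0 ? 0. : targetamp[icol - 1] );
    if( icolamp[icol] ) targetamp[icol] += jamp2[icol];
  }
  // Pick the first color whose cumulative fraction exceeds the random number
  const double rndcol = 0.7;
  int selcol = 0; // Fortran-style range [1,ncolor]
  for( int icol = 0; icol < ncolor; icol++ )
    if( rndcol < targetamp[icol] / targetamp[ncolor - 1] ) { selcol = icol + 1; break; }
  assert( selcol >= 1 && selcol <= ncolor );
  return 0;
}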
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 0, 16, -2, 0, 16, 6, 0, 16, 0, -2 }, - { 16, 48, 16, 0, 0, -2, 6, 16, 16, 0, -2, 0 }, - { 0, 16, 48, 16, 16, 6, -2, 0, 6, -2, -6, -2 }, - { 16, 0, 16, 48, 6, 16, 0, -2, -2, 6, -2, -6 }, - { -2, 0, 16, 6, 48, 16, 0, 16, -2, -6, -2, 6 }, - { 0, -2, 6, 16, 16, 48, 16, 0, -6, -2, 6, -2 }, - { 16, 6, -2, 0, 0, 16, 48, 16, -2, 0, 16, 0 }, - { 6, 16, 0, -2, 16, 0, 16, 48, 0, -2, 0, 16 }, - { 0, 16, 6, -2, -2, -6, -2, 0, 48, 16, 6, 16 }, - { 16, 0, -2, 6, -6, -2, 0, -2, 16, 48, 16, 6 }, - { 0, -2, -6, -2, -2, 6, 16, 0, 6, 16, 48, 16 }, - { -2, 0, -2, -6, 6, -2, 0, 16, 16, 6, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
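[Editor's note] The mixed-precision branch deleted just below (and re-implemented in color_sum_cpu in the new color_sum.cc) packs two double event pages into one float SIMD vector, runs the color algebra in float, and splits the result back into two double halves. The sketch below is a scalar stand-in using plain arrays; fpvmerge/fpvsplit0/fpvsplit1 are plugin SIMD helpers whose exact lane ordering may differ, and only the merge/compute/split pattern is illustrated.

// Standalone illustrative stand-in for the mixed-precision merge/split pattern
#include <array>
#include <cassert>
constexpr int neppV = 4;
using dvec = std::array<double, neppV>;
using fvec = std::array<float, 2 * neppV>;
fvec fpvmerge( const dvec& lo, const dvec& hi ) // pack two double pages into one float vector
{
  fvec out{};
  for( int i = 0; i < neppV; i++ ) { out[i] = (float)lo[i]; out[neppV + i] = (float)hi[i]; }
  return out;
}
dvec fpvsplit0( const fvec& v ) // first (even-page) half, promoted back to double
{
  dvec out{};
  for( int i = 0; i < neppV; i++ ) out[i] = v[i];
  return out;
}
dvec fpvsplit1( const fvec& v ) // second (odd-page) half, promoted back to double
{
  dvec out{};
  for( int i = 0; i < neppV; i++ ) out[i] = v[neppV + i];
  return out;
}
int main()
{
  const dvec pageEven = { 1.0, 2.0, 3.0, 4.0 };
  const dvec pageOdd = { 5.0, 6.0, 7.0, 8.0 };
  fvec merged = fpvmerge( pageEven, pageOdd );
  for( auto& x : merged ) x *= 2.0f; // stand-in for the float color algebra
  const dvec outEven = fpvsplit0( merged );
  const dvec outOdd = fpvsplit1( merged );
  assert( outEven[2] == 6.0 && outOdd[3] == 16.0 );
  return 0;
}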
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -1113,7 +1048,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +1087,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +1133,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1310,8 +1257,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1319,25 +1266,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of 
helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1482,13 +1607,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,18 +1619,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1536,93 +1660,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1725,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1748,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,21 +1757,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1724,8 +1787,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1806,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1913,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index f75309f403..35718f5b21 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f index 47e378e255..e363f036a8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f index 756e98881c..7481a1ea65 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -486,51 +490,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc new file mode 100644 index 0000000000..db09ae848e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc @@ -0,0 +1,437 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
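// A minimal standalone sketch (toy values, hypothetical names; not part of color_sum.cc or of
// the generated code) of the operation that the new color_sum.cc implements: for one helicity,
// |M|^2 is the quadratic form sum_i sum_j conj(J_i) * CF_ij * J_j / denom_i over the QCD
// partial amplitudes J ("jamps"), where the color matrix CF is real and symmetric. The plugin
// evaluates this same form via a CUDA/HIP kernel, via cuBLAS/hipBLAS GEMMs, or via the SIMD C++
// path; the plain loop below only shows the underlying arithmetic.
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;                                     // toy size; P2_gu_ttxgu uses ncolor = 12
  const double denom[ncolor] = { 3, 3 };                        // per-row color denominators
  const double cf[ncolor][ncolor] = { { 48, 16 }, { 16, 48 } }; // toy real symmetric color matrix
  const std::complex<double> jamp[ncolor] = { { 1.0, 0.5 }, { -0.25, 2.0 } };
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;                             // ztemp_i = sum_j CF_ij * J_j
    for( int j = 0; j < ncolor; j++ ) ztemp += cf[i][j] * jamp[j];
    me2 += ( std::conj( jamp[i] ) * ztemp ).real() / denom[i];  // add Re( conj(J_i) * ztemp_i ) / denom_i
  }
  std::printf( "|M|^2 (color-summed, one helicity) = %f\n", me2 );
  return 0;
}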
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 0, 16, -2, 0, 16, 6, 0, 16, 0, -2 }, + { 16, 48, 16, 0, 0, -2, 6, 16, 16, 0, -2, 0 }, + { 0, 16, 48, 16, 16, 6, -2, 0, 6, -2, -6, -2 }, + { 16, 0, 16, 48, 6, 16, 0, -2, -2, 6, -2, -6 }, + { -2, 0, 16, 6, 48, 16, 0, 16, -2, -6, -2, 6 }, + { 0, -2, 6, 16, 16, 48, 16, 0, -6, -2, 6, -2 }, + { 16, 6, -2, 0, 0, 16, 48, 16, -2, 0, 16, 0 }, + { 6, 16, 0, -2, 16, 0, 16, 48, 0, -2, 0, 16 }, + { 0, 16, 6, -2, -2, -6, -2, 0, 48, 16, 6, 16 }, + { 16, 0, -2, 6, -6, -2, 0, -2, 16, 48, 16, 6 }, + { 0, -2, -6, -2, -2, 6, 16, 0, 6, 16, 48, 16 }, + { -2, 0, -2, -6, 6, -2, 0, 16, 16, 6, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: 
number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/configs.inc index b2af8a7144..6b1cf30883 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/configs.inc @@ -510,3 +510,5 @@ C Diagram 35 DATA (SPROP(I,-4,35),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/35/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f index 0079f40417..099c6ca7c5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,111 +434,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 1),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,0,32,-4,0,32,12,0,32,0,-4/ C 1 T(1,3,2) T(5,6,4) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 2),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 13, 23) /48,32,0,0,-4,12,32,32,0,-4,0/ C 1 T(1,3,4) T(5,6,2) - DATA (CF(I, 3),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 24, 33) /48,32,32,12,-4,0,12,-4,-12,-4/ C 1 T(1,5,3,2) T(6,4) - DATA (CF(I, 4),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 4),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 34, 42) /48,12,32,0,-4,-4,12,-4,-12/ C 1 T(1,5,3,4) T(6,2) - DATA (CF(I, 5),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,5.333333333333333D+00,2.000000000000000D - $ 
+00,1.600000000000000D+01,5.333333333333333D+00/ - DATA (CF(I, 5),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA (CF(I),I= 43, 50) /48,32,0,32,-4,-12,-4,12/ C 1 T(1,5,6,2) T(3,4) - DATA (CF(I, 6),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ - DATA (CF(I, 6),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 51, 57) /48,32,0,-12,-4,12,-4/ C 1 T(1,5,6,4) T(3,2) - DATA (CF(I, 7),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 58, 63) /48,32,-4,0,32,0/ C 1 T(1,6,2) T(5,3,4) - DATA (CF(I, 8),I= 1, 6) /2.000000000000000D+00 - $ ,5.333333333333333D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00, - $ -6.666666666666666D-01,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 64, 68) /48,0,-4,0,32/ C 1 T(1,6,4) T(5,3,2) - DATA (CF(I, 9),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ - DATA (CF(I, 9),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 69, 72) /48,32,12,32/ C 1 T(3,2) T(5,1,6,4) - DATA (CF(I, 10),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 10),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 73, 75) /48,32,12/ C 1 T(3,4) T(5,1,6,2) - DATA (CF(I, 11),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ - DATA (CF(I, 11),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(5,1,3,2) T(6,4) - DATA (CF(I, 12),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 12),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(5,1,3,4) T(6,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
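// A minimal standalone sketch (toy data, hypothetical names; not the generated Fortran) of the
// packed upper-triangular color matrix that the rewritten MATRIX1 color sum uses: only entries
// with J >= I are stored, off-diagonal entries are pre-doubled, and a single integer DENOM
// divides the final result, replacing the old full NCOLOR x NCOLOR double-precision CF array.
// The assert verifies that the packed loop reproduces the full symmetric quadratic form.
#include <cassert>
#include <cmath>
#include <complex>
#include <vector>

int main()
{
  constexpr int ncolor = 3;                                // toy size; this subprocess uses ncolor = 12
  const double denom = 3;                                  // common denominator (DENOM in matrix1.f)
  const double cf[ncolor][ncolor] = { { 48, 16, 0 }, { 16, 48, 16 }, { 0, 16, 48 } };
  std::vector<double> cfPacked;                            // row-major packed upper triangle, j >= i
  for( int i = 0; i < ncolor; i++ )
    for( int j = i; j < ncolor; j++ )
      cfPacked.push_back( ( i == j ? 1 : 2 ) * cf[i][j] ); // off-diagonal entries stored pre-doubled
  const std::complex<double> jamp[ncolor] = { { 1, 2 }, { -3, 0.5 }, { 0.25, -1 } };
  double ref = 0;                                          // reference: full ncolor x ncolor sum
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      ref += ( std::conj( jamp[i] ) * cf[i][j] * jamp[j] ).real() / denom;
  double packed = 0;                                       // packed: loop only over j >= i (the CF_INDEX walk)
  int idx = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += cfPacked[idx++] * jamp[j];
    packed += ( ztemp * std::conj( jamp[i] ) ).real();
  }
  packed /= denom;
  assert( std::abs( ref - packed ) < 1e-9 );               // both formulations give the same |M|^2
  return 0;
}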
-        IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO
-     $ *SMALL_WIDTH_TREATMENT)), ZERO)
-        IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
-     $ *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        FK_ZERO = 0D0
+        IF(MDL_WT.NE.0D0) THEN
+          FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
+     $ *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        ELSE
+          FK_MDL_WT = 0D0
+        ENDIF
+
         IF(INIT_MODE) THEN
           ZEROAMP_1(:,:) = .TRUE.
@@ -761,10 +681,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
       MATRIX1 = 0.D0
       DO M = 1, NAMPSO
+        CF_INDEX = 0
         DO I = 1, NCOLOR
           ZTEMP = (0.D0,0.D0)
-          DO J = 1, NCOLOR
-            ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M)
+          DO J = I, NCOLOR
+            CF_INDEX = CF_INDEX + 1
+            ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M)
           ENDDO
           DO N = 1, NAMPSO
@@ -773,6 +695,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
           ENDDO
         ENDDO
       ENDDO
+      MATRIX1 = MATRIX1/DENOM
      IF(SDE_STRAT.EQ.1)THEN
        AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1))
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc
index acf1b836af..9e03e92989 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -16,6 +16,7 @@
 #include "mgOnGpuConfig.h"
+#include "GpuRuntime.h"
 #include "HelAmps_sm.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessChannelIds.h"
@@ -25,6 +26,7 @@
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
+#include "color_sum.h"
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,9 +101,10 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-  constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
-  constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
-  constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int nw6 = CPPProcess::nw6;       // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
+  constexpr int npar = CPPProcess::npar;     // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
+  constexpr int ncomb = CPPProcess::ncomb;   // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
   // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
   //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g.
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
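The DeviceAccessJamp2 helper added earlier in this hunk addresses the per-color squared amplitudes as buffer[icol * nevt + ievt], i.e. a structure-of-arrays layout in which, for a fixed color, consecutive GPU threads (consecutive ievt) touch consecutive memory locations. A short CUDA sketch of a consumer of that layout, with a hypothetical kernel name and a local stand-in for the plugin's fptype:

#include <cuda_runtime.h>

using fptype = double; // stand-in for the plugin's fptype in a double-precision build (assumption)

// Hypothetical kernel (illustration only): sum the squared partial amplitudes over
// colors for each event. For a fixed icol the access allJamp2s[icol * nevt + ievt] is
// stride-1 across a warp, so every iteration of the color loop is a coalesced load.
__global__ void sumJamp2OverColors( const fptype* allJamp2s, // input: [ncolor * nevt], SoA layout
                                    fptype* out,             // output: [nevt]
                                    const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  fptype sum = 0;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += allJamp2s[icol * nevt + ievt]; // coalesced: adjacent threads read adjacent addresses
  out[ievt] = sum;
}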
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -868,164 +924,43 @@ namespace mg5amcCpu jamp_sv[8] += 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gux_ttxgux()?) 
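The jamp2 running sums filled just above (|jamp(icol)|^2 accumulated over the good helicities, via plain additions on the C++ path and atomicAdd across helicity streams on the GPU path shown below) exist only to drive the event-by-event choice of a color flow: later in this patch (select_col on the GPU and the corresponding C++ block in sigmaKin) they are turned into cumulative weights and sampled with one random number per event. A minimal host-side sketch of that sampling step, with hypothetical names and without the icolamp/iconfig masking that the real code applies first:

// Illustration only: pick a color flow in the Fortran range [1,ncolor] from the
// accumulated squared partial amplitudes of one event, as select_col does.
// 'jamp2' and 'rndcol' are hypothetical per-event inputs (rndcol is uniform in [0,1)).
int chooseColor( const double* jamp2, const int ncolor, const double rndcol )
{
  double total = 0;
  for( int icol = 0; icol < ncolor; icol++ ) total += jamp2[icol];
  double cumulative = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    cumulative += jamp2[icol];
    if( rndcol < cumulative / total ) return icol + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
  }
  return ncolor; // guard against rounding when rndcol is very close to 1
}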
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, 0, 16, -2, 0 }, - { 16, 48, 6, 16, 16, 0, 0, -2, 16, 0, 0, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 0, 0, 16 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 0, -2, 16, 0 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 6, -2, -2, -6 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 6, -6, -2 }, - { -2, 0, 0, 16, 16, 6, 48, 16, -2, -6, 6, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, -6, -2, -2, 6 }, - { 0, 16, -2, 0, 6, -2, -2, -6, 48, 16, 16, 6 }, - { 16, 0, 0, -2, -2, 6, -6, -2, 16, 48, 6, 16 }, - { -2, 0, 0, 16, -2, -6, 6, -2, 16, 6, 48, 16 }, - { 0, -2, 16, 0, -6, -2, -2, 6, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
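Both the block removed above and its replacement in the new color_sum.cc rely on the same algebraic fact quoted in the comments: the color matrix is real and symmetric, so for jamp = A + iB (A, B real vectors) the quadratic form jamp^H M jamp equals A^T M A + B^T M B and is purely real, which is why the code can process the real and imaginary parts separately (and, on the C++ side, fold the factor 2 and the 1/denom into a precomputed triangular matrix). A tiny self-contained check of that identity, with made-up 2x2 numbers:

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  // Toy real symmetric matrix and complex vector (illustration only)
  constexpr int n = 2;
  const double M[n][n] = { { 48, 16 }, { 16, 48 } };
  const std::complex<double> jamp[n] = { { 1.0, -0.5 }, { 0.25, 2.0 } };
  std::complex<double> lhs = 0; // jamp^H M jamp
  double rhs = 0;               // A^T M A + B^T M B
  for( int i = 0; i < n; i++ )
    for( int j = 0; j < n; j++ )
    {
      lhs += std::conj( jamp[i] ) * M[i][j] * jamp[j];
      rhs += jamp[i].real() * M[i][j] * jamp[j].real() + jamp[i].imag() * M[i][j] * jamp[j].imag();
    }
  assert( std::abs( lhs.imag() ) < 1e-9 );       // the quadratic form is real for symmetric M
  assert( std::abs( lhs.real() - rhs ) < 1e-9 ); // ... and equals AMA + BMB
  return 0;
}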
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -1113,7 +1048,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +1087,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +1133,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1310,8 +1257,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1319,25 +1266,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of 
helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1482,13 +1607,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,18 +1619,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1536,93 +1660,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1725,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1748,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,21 +1757,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1724,8 +1787,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1806,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1913,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index 531d6bcd03..c1177b083f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f index f13f023e7d..3b0621c453 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f index a59705bfaf..0a0d60ea62 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -486,51 +490,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc new file mode 100644 index 0000000000..13c347c712 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc @@ -0,0 +1,437 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
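The new color_sum.cc translation unit that starts here (only partially visible in this excerpt) hosts both the C++ color sum (color_sum_cpu) and the GPU path that sigmaKin drives through the ghelAllJamps/ghelAllBlasTmp buffers, the per-helicity streams and the cuBLAS/hipBLAS handle. The full BLAS implementation is not shown in this hunk, so the sketch below is only one plausible way to phrase the batched color sum as a GEMM plus a small reduction kernel; all buffer and function names (d_jampRe, d_tmp, dotRows, ...) and the separate real/imaginary layout are assumptions, double precision is assumed, and the real code goes through the gpuBlas*/gpuLaunchKernel abstraction macros rather than raw cuBLAS calls:

#include <cublas_v2.h>
#include <cuda_runtime.h>

// Hypothetical helper: per-event dot product of two [ncolor * nevt] SoA buffers,
// accumulated into the per-event matrix elements.
__global__ void dotRows( const double* J, const double* T, double* MEs, const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  double sum = 0;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += J[icol * nevt + ievt] * T[icol * nevt + ievt];
  MEs[ievt] += sum;
}

// Sketch: ME(e) += sum_ij Re_i(e) Ctilde_ij Re_j(e) + Im_i(e) Ctilde_ij Im_j(e) for all events
// of one helicity. With jamps stored as buffer[icol * nevt + ievt], each real part is a
// column-major (nevt x ncolor) matrix, so T = J * Ctilde is a single GEMM (Ctilde is
// symmetric, hence no transpose is needed), followed by a row-wise dot product.
void colorSumViaBlasSketch( cublasHandle_t handle,
                            const double* d_jampRe, const double* d_jampIm, // [ncolor * nevt] each
                            const double* d_colorMat,                       // [ncolor * ncolor] normalized color matrix
                            double* d_tmp,                                  // [ncolor * nevt] workspace
                            double* d_MEs,                                  // [nevt] running sum over helicities
                            const int ncolor, const int gpublocks, const int gputhreads )
{
  const int nevt = gpublocks * gputhreads;
  const double one = 1, zero = 0;
  cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, nevt, ncolor, ncolor,
               &one, d_jampRe, nevt, d_colorMat, ncolor, &zero, d_tmp, nevt );
  dotRows<<<gpublocks, gputhreads>>>( d_jampRe, d_tmp, d_MEs, ncolor );
  cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, nevt, ncolor, ncolor,
               &one, d_jampIm, nevt, d_colorMat, ncolor, &zero, d_tmp, nevt );
  dotRows<<<gpublocks, gputhreads>>>( d_jampIm, d_tmp, d_MEs, ncolor );
}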
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, 0, 16, -2, 0 }, + { 16, 48, 6, 16, 16, 0, 0, -2, 16, 0, 0, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 0, 0, 16 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 0, -2, 16, 0 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 6, -2, -2, -6 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 6, -6, -2 }, + { -2, 0, 0, 16, 16, 6, 48, 16, -2, -6, 6, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, -6, -2, -2, 6 }, + { 0, 16, -2, 0, 6, -2, -2, -6, 48, 16, 16, 6 }, + { 16, 0, 0, -2, -2, 6, -6, -2, 16, 48, 6, 16 }, + { -2, 0, 0, 16, -2, -6, 6, -2, 16, 6, 48, 16 }, + { 0, -2, 16, 0, -6, -2, -2, 6, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: 
number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/configs.inc index e6e67b9933..bdaa2e8a30 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/configs.inc @@ -510,3 +510,5 @@ C Diagram 35 DATA (SPROP(I,-4,35),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/35/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f index 7cd8b962cc..980fe65932 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,109 +434,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,0.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 1),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,32,12,0,32,-4,0,0,32,-4,0/ C 1 T(1,2,4) T(5,3,6) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,0.000000000000000D+00/ - DATA (CF(I, 2),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 13, 23) /48,12,32,32,0,0,-4,32,0,0,-4/ C 1 T(1,2,6) T(5,3,4) - DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 24, 33) /48,32,-4,0,0,32,-4,0,0,32/ C 1 T(1,3,4) T(5,2,6) - DATA (CF(I, 4),I= 1, 6) /2.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 4),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 34, 42) /48,0,-4,32,0,0,-4,32,0/ C 1 T(1,3,6) T(5,2,4) - DATA (CF(I, 5),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ 
,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 43, 50) /48,32,32,12,12,-4,-4,-12/ C 1 T(1,5,2,4) T(3,6) - DATA (CF(I, 6),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01/ - DATA (CF(I, 6),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 51, 57) /48,12,32,-4,12,-12,-4/ C 1 T(1,5,2,6) T(3,4) - DATA (CF(I, 7),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 58, 63) /48,32,-4,-12,12,-4/ C 1 T(1,5,3,4) T(2,6) - DATA (CF(I, 8),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA (CF(I),I= 64, 68) /48,-12,-4,-4,12/ C 1 T(1,5,3,6) T(2,4) - DATA (CF(I, 9),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /-6.666666666666666D-01, - $ -2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(2,4) T(5,1,3,6) - DATA (CF(I, 10),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /-2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(2,6) T(5,1,3,4) - DATA (CF(I, 11),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,-2.000000000000000D+00/ - DATA (CF(I, 11),I= 7, 12) /2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(3,4) T(5,1,2,6) - DATA (CF(I, 12),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,-2.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 12),I= 7, 12) /-6.666666666666666D-01 - $ ,2.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(3,6) T(5,1,2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
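In the matrix1.f hunk above, the color matrix is no longer stored as a full NCOLOR x NCOLOR array of doubles but as a packed upper triangle of integers (CF) with one common denominator (DENOM); the consuming loop a little further below walks the packed array with a running CF_INDEX and divides once by DENOM at the end. The C++ sketch below mirrors that packed-triangle bookkeeping with a toy 3-color case and real-valued amplitudes; the values are invented for illustration (the generated code uses 12 colors and complex JAMP).

#include <cstdio>

int main()
{
  constexpr int ncolor = 3;                                // toy size (the generated process uses 12 colors)
  const int cf[ncolor * ( ncolor + 1 ) / 2] = { 48, 16, 6, // row 0, columns 0..2
                                                48, 16,    // row 1, columns 1..2
                                                48 };      // row 2, column  2
  const int denom = 3;                                     // single common denominator, as DATA DENOM/3/
  const double jamp[ncolor] = { 1.0, 0.5, -0.25 };         // hypothetical (real-valued) color amplitudes
  double matrix = 0;
  int cfIndex = 0;                                         // running index over the packed triangle, as CF_INDEX
  for( int i = 0; i < ncolor; i++ )
  {
    double ztemp = 0;
    for( int j = i; j < ncolor; j++ )                      // Fortran: DO J = I, NCOLOR
      ztemp += cf[cfIndex++] * jamp[j];
    matrix += ztemp * jamp[i];
  }
  matrix /= denom;                                         // Fortran: MATRIX1 = MATRIX1/DENOM
  printf( "matrix=%f\n", matrix );
  return 0;
}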
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -758,10 +680,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -770,6 +694,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index d34888db6a..b03275381b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,9 +103,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -111,10 +114,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -173,43 +173,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -221,7 +277,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -230,14 +285,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
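The color sum that calculate_jamps above now delegates to color_sum.cc (earlier in this diff) exploits two properties spelled out in its comments: the color matrix M is real, so the quadratic form conj(J)^T M J reduces to (Re J)^T M (Re J) + (Im J)^T M (Im J), and M is symmetric, so the double sum can be folded onto the upper triangle with the off-diagonal entries doubled. The short self-contained check below illustrates both identities numerically; the 3x3 matrix and jamp values are made up for the example.

#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
  constexpr int n = 3;                                                              // toy size (the real processes use 6 or 12 colors)
  const double m[n][n] = { { 27, 9, 3 }, { 9, 27, 9 }, { 3, 9, 27 } };              // real symmetric toy "color matrix"
  const std::complex<double> jamp[n] = { { 1., 2. }, { -0.5, 0.25 }, { 3., -1. } }; // hypothetical color amplitudes
  // Full quadratic form conj(jamp)^T * M * jamp: its imaginary part cancels because M is real and symmetric
  std::complex<double> full = 0;
  for( int i = 0; i < n; i++ )
    for( int j = 0; j < n; j++ )
      full += std::conj( jamp[i] ) * m[i][j] * jamp[j];
  // Triangular form: diagonal once, off-diagonal entries doubled, real and imaginary parts summed separately
  double tri = 0;
  for( int i = 0; i < n; i++ )
  {
    double ztempR = m[i][i] * jamp[i].real();
    double ztempI = m[i][i] * jamp[i].imag();
    for( int j = i + 1; j < n; j++ )
    {
      ztempR += 2 * m[i][j] * jamp[j].real();
      ztempI += 2 * m[i][j] * jamp[j].imag();
    }
    tri += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  assert( std::abs( full.imag() ) < 1e-12 );
  assert( std::abs( full.real() - tri ) < 1e-9 );
  printf( "full=%f triangular=%f\n", full.real(), tri );
  return 0;
}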
@@ -263,14 +321,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -294,7 +348,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -308,7 +361,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -319,6 +371,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -449,158 +505,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 12. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uc_ttxuc()?) 
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
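Both the removed code here and its replacement in color_sum.cc support a mixed floating-point mode (MGONGPU_FPTYPE_DOUBLE together with MGONGPU_FPTYPE2_FLOAT), in which the color algebra runs in single precision while the rest of the calculation stays in double precision. The scalar C++ sketch below only illustrates that precision trade-off with plain casts; the toy matrix and jamp values are invented, and the real code works on SIMD vectors via fpvmerge/fpvsplit rather than on scalars.

#include <cstdio>

int main()
{
  const double jampR[2] = { 1.000000123456789, -0.333333333333333 }; // hypothetical double-precision color amplitudes (real parts)
  const double cf[2][2] = { { 16., 2. }, { 2., 16. } };              // toy 2x2 color matrix
  // Reference: quadratic form evaluated entirely in double precision
  double refD = 0;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
      refD += jampR[i] * cf[i][j] * jampR[j];
  // "Mixed" mode: downcast the amplitudes, do the color algebra in float, upcast the result
  const float jampRf[2] = { (float)jampR[0], (float)jampR[1] };
  float sumF = 0;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
      sumF += jampRf[i] * (float)cf[i][j] * jampRf[j];
  const double mixed = (double)sumF;
  printf( "double=%.15f mixed=%.15f diff=%.2e\n", refD, mixed, refD - mixed );
  return 0;
}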
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -688,7 +629,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -723,6 +668,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -765,6 +714,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -885,8 +838,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -894,25 +847,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 
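The select_hel kernel above first accumulates, per event, the running sum of |M|^2 over the good helicities and then picks the first helicity whose cumulative fraction of the total exceeds the event's random number; the device loop that follows this comment is the actual implementation. A host-side C++ sketch of that cumulative-probability selection, with hypothetical per-helicity weights, is:

#include <cstdio>
#include <vector>

// Event-by-event selection by cumulative probability (hypothetical weights, 0-based helicity index)
int selectHelicity( const std::vector<double>& helMEs, double rnd )
{
  double total = 0;
  for( double me : helMEs ) total += me;
  double running = 0;
  for( std::size_t ighel = 0; ighel < helMEs.size(); ighel++ )
  {
    running += helMEs[ighel];
    if( rnd < running / total ) return static_cast<int>( ighel ); // first helicity whose cumulative fraction exceeds rnd
  }
  return static_cast<int>( helMEs.size() ) - 1; // guard against rounding when rnd is very close to 1
}

int main()
{
  const std::vector<double> helMEs = { 0.1, 0.4, 0.2, 0.3 }; // hypothetical |M|^2 contributions of the good helicities for one event
  printf( "rnd=0.05 -> ighel=%d\n", selectHelicity( helMEs, 0.05 ) ); // picks helicity 0
  printf( "rnd=0.60 -> ighel=%d\n", selectHelicity( helMEs, 0.60 ) ); // picks helicity 2
  return 0;
}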
+ //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1057,13 +1188,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1075,18 +1200,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1111,93 +1241,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1239,7 +1306,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1262,7 +1329,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1271,21 +1338,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1299,8 +1368,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1316,11 +1387,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1422,14 +1494,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 08510dfc85..0b88c815d2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -80,6 +81,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -127,7 +129,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -135,9 +137,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -157,34 +161,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f index bb9d2c55fb..5eb74ead8d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f index b76b7c4456..71844a31af 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -142,7 +142,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -151,7 +151,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) S2=PDG2PDF(LPP(IB(2)),3, IB(2),XBK(IB(2)), QSCALE) @@ -243,7 +243,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -321,6 +321,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -406,20 +410,20 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -513,51 +517,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
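// [Editorial aside: hedged reference sketch, not part of the generated file]
// This new file evaluates the color sum |M|^2 = sum_{i,j} jamp_i^* ( colorMatrix[i][j] / colorDenom[i] ) jamp_j
// for one helicity. Because the color matrix is real and symmetric, the quadratic form splits into
// two purely real ones, Re(J)^T M Re(J) + Im(J)^T M Im(J), which is the identity exploited by both the
// kernel and the CPU implementations below (and by the BLAS path, which factorizes the same computation
// into one GEMM, Ztemp = M * J, followed by batched per-event dot products). A minimal reference
// implementation of that identity, using plain std::complex and hypothetical names, is:

#include <complex>

// me2 = Re( J^dagger * M * J ) for a real matrix M, using only real products
// (cf[i*ncol+j] is assumed to hold colorMatrix[i][j]/colorDenom[i])
inline double
colorSumReference( const std::complex<double>* jamp, // input: jamp[ncol] for one event and one helicity
                   const double* cf,                 // input: normalized color matrix [ncol*ncol]
                   const int ncol )                  // input: number of leading colors
{
  double me2 = 0;
  for( int i = 0; i < ncol; i++ )
  {
    double ztR = 0, ztI = 0;
    for( int j = 0; j < ncol; j++ )
    {
      ztR += cf[i * ncol + j] * jamp[j].real();
      ztI += cf[i * ncol + j] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI;
  }
  return me2;
}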
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
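    // [Editorial aside: worked example of the symmetric-to-triangular rewrite used here]
    //   sum_{i,j} A_i M_ij A_j = sum_i M_ii A_i^2 + 2 * sum_{i<j} M_ij A_i A_j   (M real symmetric)
    // e.g. for ncolor=2 and A=(a1,a2):
    //   a1*M11*a1 + a1*M12*a2 + a2*M21*a1 + a2*M22*a2 = M11*a1^2 + M22*a2^2 + 2*M12*a1*a2  (since M21=M12)
    // which is why the off-diagonal entries of cf2 are pre-multiplied by 2 (and divided by
    // colorDenom) once at compile time in TriangularNormalizedColorMatrix.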
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/configs.inc index ddb1d6a390..eb5fc269e9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/configs.inc @@ -105,3 +105,5 @@ C Diagram 7 DATA (SPROP(I,-4,7),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/7/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f index bfe665d186..c79c6062f4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -76,10 +76,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -280,17 +277,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -360,7 +346,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -408,7 +394,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -451,39 +438,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(3,1) T(5,2) T(6,4) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(3,1) T(5,4) T(6,2) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(3,2) T(5,1) T(6,4) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(3,2) T(5,4) T(6,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(3,4) T(5,1) T(6,2) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(3,4) T(5,2) T(6,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
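C     [Editorial note: descriptive comment, not generated code] The color matrix above is now
C     stored as a packed upper triangle of length NCOLOR*(NCOLOR+1)/2 = 21 (for NCOLOR=6),
C     scanned with CF_INDEX in the DO J = I, NCOLOR loop further below. Off-diagonal entries
C     are stored pre-doubled (9 -> 18, 3 -> 6) so that the triangular loop reproduces the
C     full symmetric color sum, and the overall normalisation is applied once at the end
C     via MATRIX1 = MATRIX1/DENOM, mirroring the triangular color sum on the cudacpp side.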
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -553,10 +533,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -565,6 +547,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index 66e4b80f71..aa721caff8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -107,9 +109,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -117,10 +120,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -179,43 +179,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -227,7 +283,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -236,14 +291,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
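
[Editor's illustrative aside, not part of the generated patch.] The accessor DeviceAccessJamp2 introduced above assumes a colour-major, event-innermost layout for the per-colour |jamp|^2 super-buffer, i.e. element (icol, ievt) sits at buffer[icol * nevt + ievt]. A minimal host-side C++ sketch of that indexing (toy sizes, hypothetical names) is:

#include <cassert>
#include <vector>

int main()
{
  const int ncolor = 6; // leading colours for this subprocess
  const int nevt = 4;   // a tiny toy event count
  std::vector<double> jamp2( ncolor * nevt, 0. );
  // Fill element (icol, ievt) with a recognisable value
  for( int icol = 0; icol < ncolor; icol++ )
    for( int ievt = 0; ievt < nevt; ievt++ )
      jamp2[icol * nevt + ievt] = 100. * icol + ievt;
  // All events of a given colour are contiguous, so consecutive GPU threads
  // (consecutive ievt) reading the same icol access coalesced memory
  assert( jamp2[2 * nevt + 3] == 203. );
  return 0;
}
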
@@ -269,14 +327,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -300,7 +354,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -314,7 +367,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -325,6 +377,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -455,158 +511,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 36. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_ucx_ttxucx()?) 
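
[Editor's illustrative aside, not part of the generated patch.] The block above accumulates per-colour |jamp(icol)|^2 over all good helicities, which later drives the event-by-event colour choice; as the in-code comment notes, the CUDA path must use atomicAdd because each good helicity now runs in its own stream and may update colAllJamp2s concurrently. A serial C++ sketch of the same accumulation (toy values, hypothetical names) is:

#include <complex>
#include <vector>

int main()
{
  const int ncolor = 6, ngoodhel = 3;
  // toy partial amplitudes jamp[ihel][icol]
  std::vector<std::vector<std::complex<double>>> jamp(
    ngoodhel, std::vector<std::complex<double>>( ncolor, { 0.1, -0.2 } ) );
  std::vector<double> jamp2( ncolor, 0. ); // running sum over helicities
  for( int ihel = 0; ihel < ngoodhel; ihel++ )
    for( int icol = 0; icol < ncolor; icol++ )
      jamp2[icol] += std::norm( jamp[ihel][icol] ); // |jamp|^2 = re^2 + im^2
  // On the GPU this "+=" becomes an atomicAdd, since helicity streams run in parallel
  return 0;
}
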
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
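
[Editor's illustrative aside, not part of the generated patch.] The code removed above (and re-hosted in color_sum.cc) implements the #475 optimisation: since the colour matrix cf is real and symmetric and the per-row denominators are equal, the quadratic form sum_ij conj(J_i) (cf_ij/denom_i) J_j can be evaluated over the upper triangle only, with the factor 2 and the 1/denom folded in, using just the real and imaginary parts of the colour flows. A self-contained C++ sketch with a toy 3x3 matrix (hypothetical values) comparing the two evaluations:

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  const int n = 3;
  const double denom[n] = { 1, 1, 1 }; // all equal, as in the generated processes
  const double cf[n][n] = { { 27, 9, 3 }, { 9, 27, 9 }, { 3, 9, 27 } }; // real symmetric
  const std::complex<double> jamp[n] = { { 0.3, -0.1 }, { -0.2, 0.4 }, { 0.1, 0.2 } };
  // (1) Reference: full double loop over the dense matrix
  double me1 = 0;
  for( int i = 0; i < n; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < n; j++ ) ztemp += cf[i][j] * jamp[j];
    me1 += ( ztemp * std::conj( jamp[i] ) ).real() / denom[i];
  }
  // (2) Upper triangle only: diagonal once, off-diagonal doubled, real arithmetic
  double me2 = 0;
  for( int i = 0; i < n; i++ )
  {
    double ztR = cf[i][i] / denom[i] * jamp[i].real();
    double ztI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < n; j++ )
    {
      ztR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI;
  }
  assert( std::abs( me1 - me2 ) < 1e-12 ); // the two forms agree
  return 0;
}
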
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -694,7 +635,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -729,6 +674,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -771,6 +720,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -891,8 +844,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -900,25 +853,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 
+ //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1063,13 +1194,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1081,18 +1206,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1117,93 +1247,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1245,7 +1312,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1268,7 +1335,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1277,21 +1344,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1305,8 +1374,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1322,11 +1393,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1428,14 +1500,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index 04b9f5bcb1..fda9e102a7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -86,6 +87,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -133,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -141,9 +143,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -163,34 +167,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f index 5046df7e56..4f2b282d2a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f index 848991a32a..071faddb9b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -148,7 +148,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -158,7 +158,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -269,7 +269,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -353,6 +353,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -438,24 +442,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -585,51 +589,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
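
[Editor's illustrative aside, not part of the generated patch.] The DSIG1_VEC change above replaces a flat loop over IVEC with a warp-blocked structure: the beam ordering IB(1)/IB(2) is set once per warp from IMIRROR_VEC, and the flat event index is recovered as IVEC = (CURR_WARP-1)*WARP_SIZE + IWARP. A schematic C++ rendering of that loop shape (hypothetical names, illustrative only) is:

#include <cstdio>
#include <vector>

int main()
{
  const int warp_size = 4, nb_warp_used = 2;
  std::vector<int> imirror_vec = { 1, 2 }; // one mirror flag per warp
  for( int curr_warp = 0; curr_warp < nb_warp_used; curr_warp++ )
  {
    int ib[2];
    if( imirror_vec[curr_warp] == 1 ) { ib[0] = 1; ib[1] = 2; } // no mirroring
    else { ib[0] = 2; ib[1] = 1; }                              // mirrored beams
    for( int iwarp = 0; iwarp < warp_size; iwarp++ )
    {
      const int ivec = curr_warp * warp_size + iwarp; // flat event index
      std::printf( "ivec=%d beams=(%d,%d)\n", ivec, ib[0], ib[1] );
    }
  }
  return 0;
}
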
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/configs.inc index 6da72c9bac..1a0a5a720d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/configs.inc @@ -105,3 +105,5 @@ C Diagram 7 DATA (SPROP(I,-4,7),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/7/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f index 5dcb5155f3..f6321517c1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -82,10 +82,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -286,17 +283,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -366,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -420,7 +406,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -463,39 +450,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(2,1) T(3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(2,1) T(3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(2,4) T(3,1) T(5,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(2,4) T(3,6) T(5,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(2,6) T(3,1) T(5,4) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(2,6) T(3,4) T(5,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
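The matrix1.f hunk above also switches the colour data from a REAL*8 NCOLOR x NCOLOR matrix to a packed INTEGER array CF(NCOLOR*(NCOLOR+1)/2) with a common DENOM: each row stores the diagonal entry followed by the doubled off-diagonal entries for J >= I, and the rewritten colour-sum loop further down in this hunk walks the array sequentially via CF_INDEX. The standalone C++ check below reproduces that packing from the original 6x6 colour matrix and compares it with the DATA values shown above; it is an illustration only, not generated code.

// Standalone check (not generated code): pack the 6x6 real colour matrix into
// the 21-element triangular integer array used by matrix1.f, doubling the
// off-diagonal entries, and compare with the DATA statements in the hunk above.
#include <cassert>

int main()
{
  constexpr int ncolor = 6;
  const int full[ncolor][ncolor] = { { 27, 9, 9, 3, 3, 9 },
                                     { 9, 27, 3, 9, 9, 3 },
                                     { 9, 3, 27, 9, 9, 3 },
                                     { 3, 9, 9, 27, 3, 9 },
                                     { 3, 9, 9, 3, 27, 9 },
                                     { 9, 3, 3, 9, 9, 27 } };
  // Packed values as they appear in the DATA statements above
  const int packedData[ncolor * ( ncolor + 1 ) / 2] = { 27, 18, 18, 6, 6, 18,
                                                        27, 6, 18, 18, 6,
                                                        27, 18, 18, 6,
                                                        27, 6, 18,
                                                        27, 18,
                                                        27 };
  // Reproduce the packing: walk rows i, columns j >= i (this mirrors the
  // sequential CF_INDEX increment in the rewritten colour-sum loop)
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = i; j < ncolor; j++ )
    {
      const int expected = ( j == i ? full[i][j] : 2 * full[i][j] ); // off-diagonals doubled
      assert( packedData[cfIndex] == expected );
      cfIndex++;
    }
  assert( cfIndex == ncolor * ( ncolor + 1 ) / 2 ); // 21 entries for ncolor=6
  return 0;
}

Since DENOM is 1 for this subprocess, the final MATRIX1/DENOM division is a no-op here, presumably kept so that the same loop structure also covers processes with non-unit colour denominators.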
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -565,10 +545,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -577,6 +559,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 8d266e82b7..8703f64023 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
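The calculate_jamps signature above writes each helicity's partial amplitudes into a slice of a jamp super-buffer, and the striding comments in color_sum.cc earlier in this patch spell out the layout as [2][ncolor][nhel][nevt] (all real parts first, then all imaginary parts). The short standalone C++ sketch below simply encodes that flat index; jampIndex is a hypothetical helper for illustration, not the plugin's DeviceAccessJamp class.

// Hedged sketch (hypothetical helper): the flat index implied by the striding
// comments in color_sum_blas above, for a jamp super-buffer laid out as
// [nx2][ncolor][nhel][nevt] with nx2=2 (real, imaginary).
#include <cassert>
#include <cstddef>

inline std::size_t jampIndex( int ix2, int icol, int ihel, int ievt,
                              int ncolor, int nhel, int nevt )
{
  // index = ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt
  return ( ( static_cast<std::size_t>( ix2 ) * ncolor + icol ) * nhel + ihel ) * static_cast<std::size_t>( nevt ) + ievt;
}

int main()
{
  const int ncolor = 6, nhel = 16, nevt = 1024;
  // real part of the jamp for colour 2, helicity 5, event 17
  const std::size_t idxRe = jampIndex( 0, 2, 5, 17, ncolor, nhel, nevt );
  // imaginary part of the same jamp
  const std::size_t idxIm = jampIndex( 1, 2, 5, 17, ncolor, nhel, nevt );
  assert( idxIm == idxRe + static_cast<std::size_t>( ncolor ) * nhel * nevt );
  return 0;
}

With this layout the imaginary part of any jamp sits exactly ncolor*nhel*nevt elements after its real part, which is what lets color_sum_blas treat the real and imaginary halves as two separate ncolor x (nhel*nevt) matrices.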
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -559,158 +615,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 12. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uu_ttxuu()?) 
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
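The comment block above (kept verbatim in the new color_sum.cc) is the core of the colour-sum optimization: because the colour matrix is real, the quadratic form splits into separate real-part and imaginary-part contributions, and because it is symmetric only the upper triangle needs to be visited, with off-diagonal terms counted twice. The standalone C++ check below evaluates both forms for the 6x6 colour matrix of this subprocess and arbitrary made-up jamp values and verifies that they agree; it is a worked example, not the plugin implementation.

// Worked example (not the plugin code): the colour sum |M|^2 += J^dagger (CF/denom) J
// computed with the full symmetric colour matrix and with its upper-triangular
// form where off-diagonal entries are counted twice, as the comments above explain.
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 6;
  const double denom[ncolor] = { 1, 1, 1, 1, 1, 1 };
  const double cf[ncolor][ncolor] = { { 27, 9, 9, 3, 3, 9 },
                                      { 9, 27, 3, 9, 9, 3 },
                                      { 9, 3, 27, 9, 9, 3 },
                                      { 3, 9, 9, 27, 3, 9 },
                                      { 3, 9, 9, 3, 27, 9 },
                                      { 9, 3, 3, 9, 9, 27 } };
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 },
                                              { 0.7, -0.8 }, { -0.9, 1.0 }, { 1.1, 1.2 } };
  // (1) Full quadratic form: since CF is real, only ReJi*ReJj + ImJi*ImJj survives.
  double meFull = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meFull += cf[i][j] / denom[i] * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() );
  // (2) Triangular form: diagonal once, off-diagonal terms doubled (CF symmetric, denom constant).
  double meTri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    meTri += cf[i][i] / denom[i] * std::norm( jamp[i] );
    for( int j = i + 1; j < ncolor; j++ )
      meTri += 2 * cf[i][j] / denom[i] * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() );
  }
  assert( std::abs( meFull - meTri ) < 1e-12 * std::abs( meFull ) );
  return 0;
}

As the original comment notes, the triangular loop gains less than a factor 2 on CPU, while the square loop had been kept for CUDA; the new color_sum_kernel earlier in this patch now applies a lower-triangular variant of the same idea on the GPU as well.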
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -798,7 +739,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -833,6 +778,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -875,6 +824,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -995,8 +948,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1004,25 +957,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1167,13 +1298,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1185,18 +1310,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1221,93 +1351,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1349,7 +1416,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1372,7 +1439,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1381,21 +1448,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1409,8 +1478,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1426,11 +1497,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1532,14 +1604,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index fd123d932d..1b094c86d9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 14; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f index 77164138e6..286e0ec15a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f index f03c7f3b0c..4e96309281 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -497,51 +501,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
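// What this new file computes, in plain terms: for one helicity, the color sum adds
// |M|^2 += sum_{i,j} Re( conj(jamp[i]) * CF[i][j] * jamp[j] ) / denom[i] to the running
// sum over helicities, using the 6x6 color matrix and denominators defined below.
// The following is a minimal standalone sketch of that sum (illustrative only: plain
// std::complex and a hypothetical colorSumReference helper, not the plugin's buffers,
// memory accessors or SIMD types), useful as a reference when reading the optimised
// color_sum_cpu / color_sum_kernel / color_sum_blas paths that follow.

#include <array>
#include <complex>

double colorSumReference( const std::array<std::complex<double>, 6>& jamp )
{
  constexpr int ncolor = 6; // number of leading colors for this process
  constexpr double denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // the colorDenom values below
  constexpr double cf[ncolor][ncolor] = { // the colorMatrix values below
    { 27, 9, 9, 3, 3, 9 },
    { 9, 27, 3, 9, 9, 3 },
    { 9, 3, 27, 9, 9, 3 },
    { 3, 9, 9, 27, 3, 9 },
    { 3, 9, 9, 3, 27, 9 },
    { 9, 3, 3, 9, 9, 27 } };
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += cf[icol][jcol] * jamp[jcol]; // full symmetric matrix, no triangular folding
    deltaME += std::real( std::conj( jamp[icol] ) * ztemp ) / denom[icol];
  }
  return deltaME; // contribution of this helicity to |M|^2, before the final average over helicities/colors
}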
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
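// A standalone sketch of the #475 triangular folding that cf2 above implements
// (illustrative only: a hypothetical colorSumFolded helper with plain doubles, no SIMD
// vectors, no memory accessors). The symmetric matrix is folded once into its upper
// triangle, with off-diagonal entries doubled and each row pre-divided by its
// denominator, and the sum AMA + BMB is then evaluated over j >= i only; with the equal
// colorDenom entries of this process the result matches the plain full-matrix sum.

#include <complex>

template<int NCOL>
double colorSumFolded( const std::complex<double> ( &jamp )[NCOL],
                       const double ( &cf )[NCOL][NCOL],
                       const double ( &denom )[NCOL] )
{
  double tri[NCOL][NCOL] = {}; // upper-triangular, pre-normalized copy of the color matrix
  for( int i = 0; i < NCOL; i++ )
  {
    tri[i][i] = cf[i][i] / denom[i]; // diagonal terms: no factor 2
    for( int j = i + 1; j < NCOL; j++ )
      tri[i][j] = 2 * cf[i][j] / denom[i]; // off-diagonal terms: (i,j) and (j,i) folded together
  }
  double deltaME = 0;
  for( int i = 0; i < NCOL; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = i; j < NCOL; j++ )
    {
      ztempR += tri[i][j] * jamp[j].real();
      ztempI += tri[i][j] * jamp[j].imag();
    }
    deltaME += jamp[i].real() * ztempR + jamp[i].imag() * ztempI; // AMA + BMB: the mixed terms cancel for a real matrix
  }
  return deltaME;
}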
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/configs.inc index ab6edc7392..c4057663aa 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/configs.inc @@ -210,3 +210,5 @@ C Diagram 14 DATA (SPROP(I,-4,14),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/14/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f index 8b80833180..d9f5b54ed3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(16) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,39 +434,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(3,1) T(5,2) T(6,4) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(3,1) T(5,4) T(6,2) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(3,2) T(5,1) T(6,4) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(3,2) T(5,4) T(6,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(3,4) T(5,1) T(6,2) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(3,4) T(5,2) T(6,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -585,10 +565,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -597,6 +579,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index 1b918bae84..6ffd72682d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -107,9 +109,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -117,10 +120,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -179,43 +179,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -227,7 +283,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -236,14 +291,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
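A minimal CUDA sketch of the structure-of-arrays indexing used by the DeviceAccessJamp2 accessor defined earlier in this file: element (icol, ievt) of a per-color, per-event buffer lives at icol * nevt + ievt, so consecutive threads of a warp read consecutive addresses for a fixed color. The kernel name and the plain double-precision buffers here are assumptions for illustration only.

```cpp
#include <cuda_runtime.h>

// Hypothetical kernel: sum a [ncolor * nevt] per-color buffer over colors, one event per thread.
__global__ void sumOverColors( const double* colAllJamp2s, // input: [ncolor * nevt]
                               double* out,                // output: [nevt]
                               const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  double sum = 0.;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += colAllJamp2s[icol * nevt + ievt]; // same formula as kernelAccessIcolConst above
  out[ievt] = sum;
}
```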
@@ -269,14 +327,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -300,7 +354,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -314,7 +367,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -325,6 +377,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -455,158 +511,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 12. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uux_ttxccx()?) 
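In the GPU branch that follows, each good helicity runs calculate_jamps in its own stream, and all of them accumulate |jamp|^2 into the same (icol, ievt) slot of the shared colAllJamp2s buffer, which is why an atomicAdd is used there. The hedged sketch below (hypothetical kernel and buffer names, double precision, atomicAdd on double assumes compute capability 6.0 or later) shows the pattern: a plain "+=" would be a data race between streams.

```cpp
#include <cuda_runtime.h>

__global__ void accumulateJamp2OneHelicity( double* colAllJamp2s, // in/out: [ncolor * nevt], shared by all helicity streams
                                            const double* jampRe, // input: [ncolor * nevt], real parts for this helicity
                                            const double* jampIm, // input: [ncolor * nevt], imaginary parts for this helicity
                                            const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double re = jampRe[icol * nevt + ievt];
    const double im = jampIm[icol * nevt + ievt];
    atomicAdd( &colAllJamp2s[icol * nevt + ievt], re * re + im * im ); // race-free sum across helicity streams
  }
}

void accumulateAllHelicities( double* colAllJamp2s, double* const* jampReByHel, double* const* jampImByHel,
                              cudaStream_t* streams, int nGoodHel, int ncolor, int gpublocks, int gputhreads )
{
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) // one good helicity per stream
    accumulateJamp2OneHelicity<<<gpublocks, gputhreads, 0, streams[ighel]>>>(
      colAllJamp2s, jampReByHel[ighel], jampImByHel[ighel], ncolor );
  cudaDeviceSynchronize(); // wait for all streams before colAllJamp2s is consumed (e.g. for color choice)
}
```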
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
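The removed comments above (#475) note that the color matrix M is real and symmetric, so the quadratic form conj(J)^T M J can be evaluated purely in real arithmetic as A^T M A + B^T M B with J = A + iB: the imaginary cross terms cancel because A^T M B = B^T M A. The following C++ sketch spells out both evaluations with the cf values from this file (all color denominators are 1 for this process); the function names are illustrative, and the two results agree up to rounding.

```cpp
#include <complex>

constexpr int ncolor6 = 6;
constexpr double cfFull[ncolor6][ncolor6] = {
  { 27, 9, 9, 3, 3, 9 },
  { 9, 27, 3, 9, 9, 3 },
  { 9, 3, 27, 9, 9, 3 },
  { 3, 9, 9, 27, 3, 9 },
  { 3, 9, 9, 3, 27, 9 },
  { 9, 3, 3, 9, 9, 27 } };

// Complex evaluation: Re( sum_ij conj(J_i) * M_ij * J_j )
double colorSumComplex( const std::complex<double> jamp[ncolor6] )
{
  std::complex<double> sum = 0;
  for( int i = 0; i < ncolor6; i++ )
    for( int j = 0; j < ncolor6; j++ )
      sum += std::conj( jamp[i] ) * cfFull[i][j] * jamp[j];
  return sum.real(); // the imaginary part vanishes because M is real and symmetric
}

// Real evaluation: A^T M A + B^T M B, i.e. two real quadratic forms (no complex products)
double colorSumRealParts( const std::complex<double> jamp[ncolor6] )
{
  double sum = 0;
  for( int i = 0; i < ncolor6; i++ )
    for( int j = 0; j < ncolor6; j++ )
      sum += cfFull[i][j] * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() );
  return sum; // equals colorSumComplex( jamp ) up to rounding
}
```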
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -694,7 +635,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -729,6 +674,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -771,6 +720,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -891,8 +844,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -900,25 +853,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 
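The added code below chooses one helicity per event from the running sums over good helicities: the first good helicity whose cumulative |M|^2 fraction exceeds the event's random number is selected and stored in Fortran indexing. A minimal host-side C++ sketch of the same logic (a hypothetical standalone helper, not the generated kernel) follows.

```cpp
#include <vector>

int selectHelicity( const std::vector<double>& meByGoodHel, // |M|^2 contribution of each good helicity for one event
                    const std::vector<int>& goodHel,        // map ighel -> ihel (cudacpp indexing, 0..ncomb-1)
                    double rndhel )                         // flat random number in [0,1) for this event
{
  double total = 0.;
  for( double me : meByGoodHel ) total += me;
  double running = 0.;
  for( int ighel = 0; ighel < (int)meByGoodHel.size(); ighel++ )
  {
    running += meByGoodHel[ighel];
    if( rndhel < running / total ) return goodHel[ighel] + 1; // +1: Fortran range [1,ncomb]
  }
  return goodHel.back() + 1; // numerical safety net for rndhel ~ 1 and rounding
}
```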
+ //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1063,13 +1194,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1081,18 +1206,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1117,93 +1247,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1245,7 +1312,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1268,7 +1335,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1277,21 +1344,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1305,8 +1374,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1322,11 +1393,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1428,14 +1500,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index 87faf25dfb..8fc89f1eaf 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -86,6 +87,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -133,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -141,9 +143,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -163,34 +167,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f index e3f26606a1..7bbb5f78a7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f index 74f009d272..b302e0aabb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -148,7 +148,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -158,7 +158,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -269,7 +269,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -353,6 +353,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -438,24 +442,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -585,51 +589,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] =
+          allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* ghelAllMEs,           // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity
+                  const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities
+                  fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nhel good helicities
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null)
+#else
+                  gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null)
+#endif
+                  const int nhel,               // input: number of good helicities (nhel == nGoodHel)
+                  const int gpublocks,          // input: cuda gpublocks
+                  const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffer for the nhel helicities
+    fptype2* ghelAllZtempBoth = ghelAllBlasTmp;                                         // start of first fptype2[ncolor*2*nhel*nevt] buffer
+    fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt;   // start of second fptype2[ncolor*2*nhel*nevt] buffer
+    fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer
+    // Convert jamps from double to float
+    for( int ighel = 0; ighel < nhel; ighel++ )
+    {
+      const fptype* hAllJamps = ghelAllJamps + ighel * nevt;    // jamps for a single helicity ihel
+      fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel
+      gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel );
+    }
+    // Real and imaginary components
+    const fptype2* ghelAllJampsReal = ghelAllJampsFpt2;
+    const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt;
+#else
+    // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer
+    static_assert( std::is_same<fptype, fptype2>::value );
+    fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/configs.inc index 4cc87fa0bb..ae09f753a1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/configs.inc @@ -90,3 +90,5 @@ C Diagram 7 DATA (SPROP(I,-4,7),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/7/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f index 728711155f..46ce392684 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -82,10 +82,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -286,17 +283,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -366,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -420,7 +406,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -463,39 +450,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(2,1) T(3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(2,1) T(3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(2,4) T(3,1) T(5,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(2,4) T(3,6) T(5,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(2,6) T(3,1) T(5,4) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(2,6) T(3,4) T(5,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -565,10 +545,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -577,6 +559,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 1c575b7757..7c3e3fa6c7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId 
= 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using 
E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
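
The super-buffer striding that calculate_jamps and the color-sum kernels rely on (spelled out in the "striding for cuBLAS" comments of color_sum.cc above) can be written as a standalone indexing helper. The sketch below is illustrative only and is not part of the generated code: jampSuperIndex is a hypothetical name, and the real accessor class is DeviceAccessJamp (not shown in this diff).

#include <cstddef>

// Flat index of the real (ireim=0) or imaginary (ireim=1) part of jamp(icol), for good helicity
// ighel and event ievt, in a super-buffer of 2 * ncolor * nGoodHel * nevt values laid out as
// [ireim][icol][ighel][ievt] with the event index running fastest.
inline std::size_t
jampSuperIndex( int ireim, int icol, int ighel, int ievt, int ncolor, int nGoodHel, int nevt )
{
  return static_cast<std::size_t>( ireim ) * ncolor * nGoodHel * nevt // real block first, then imaginary block
         + static_cast<std::size_t>( icol ) * nGoodHel * nevt         // then one slice per color
         + static_cast<std::size_t>( ighel ) * nevt                   // then one slice per good helicity
         + static_cast<std::size_t>( ievt );                          // events are contiguous
}

With this layout, "ghelAllJamps + ighel * nevt" is the address of the (ireim=0, icol=0, ievt=0) element for helicity ighel, which is why sigmaKin can hand that offset directly to the per-helicity kernels, which in turn index the buffer with ihel0=0 and an nGoodHel stride.
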
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -868,164 +924,43 @@ namespace mg5amcCpu jamp_sv[10] += 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uux_ttxgg()?) 
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, -6, 16, -2, 16, -2, 6, 6, 0, -2, 16, 0 }, - { -6, 48, -2, 16, -2, 16, 6, 6, 0, 16, -2, 0 }, - { 16, -2, 48, -6, 6, 6, 16, -2, -2, 0, 0, 16 }, - { -2, 16, -6, 48, 6, 6, -2, 16, 16, 0, 0, -2 }, - { 16, -2, 6, 6, 48, -6, 16, -2, 16, 0, 0, -2 }, - { -2, 16, 6, 6, -6, 48, -2, 16, -2, 0, 0, 16 }, - { 6, 6, 16, -2, 16, -2, 48, -6, 0, 16, -2, 0 }, - { 6, 6, -2, 16, -2, 16, -6, 48, 0, -2, 16, 0 }, - { 0, 0, -2, 16, 16, -2, 0, 0, 48, 16, 16, 6 }, - { -2, 16, 0, 0, 0, 0, 16, -2, 16, 48, 6, 16 }, - { 16, -2, 0, 0, 0, 0, -2, 16, 16, 6, 48, 16 }, - { 0, 0, 16, -2, -2, 16, 0, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
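
// Illustrative scalar sketch of the two properties used in the comments above (not part of the
// diff; colorSumScalar and its arguments are hypothetical, double precision only). With
// jamp = A + i*B and a real color matrix M, the cross terms of (A-iB)^T M (A+iB) are purely
// imaginary, so the real part is A^T M A + B^T M B; if the normalized matrix is also symmetric,
// the double sum folds into a loop over the upper triangle with a factor 2 off the diagonal.
#include <complex>
#include <vector>

// Returns sum_{i,j} Re( conj(jamp[i]) * m[i][j] * jamp[j] ) for a real symmetric matrix m,
// visiting each off-diagonal pair only once (factor 2) and each diagonal term once.
double colorSumScalar( const std::vector<std::vector<double>>& m,
                       const std::vector<std::complex<double>>& jamp )
{
  const int ncol = (int)jamp.size();
  double me2 = 0;
  for( int i = 0; i < ncol; i++ )
  {
    double ztR = m[i][i] * jamp[i].real();
    double ztI = m[i][i] * jamp[i].imag();
    for( int j = i + 1; j < ncol; j++ )
    {
      ztR += 2 * m[i][j] * jamp[j].real(); // the factor 2 is what the constexpr triangular matrix precomputes
      ztI += 2 * m[i][j] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI; // AMA + BMB: the imaginary cross terms drop out
  }
  return me2;
}
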
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -1113,7 +1048,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +1087,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +1133,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1310,8 +1257,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1319,25 +1266,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of 
helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1482,13 +1607,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,18 +1619,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1536,93 +1660,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1725,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1748,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,21 +1757,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1724,8 +1787,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1806,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1913,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 0689624568..b52ac7b6b3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f index 5787ba42b2..91d2c20b98 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f index 75d947b792..ede7c99981 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -497,51 +501,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc new file mode 100644 index 0000000000..82ceb3958f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc @@ -0,0 +1,437 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
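[Reviewer note, not part of the patch] The new color_sum.cc introduced here evaluates the per-helicity |M|^2 as the quadratic form jamp^dagger * (colorMatrix/colorDenom) * jamp; because the color matrix is real and symmetric (and colorDenom is constant for this process), only the diagonal plus twice one triangle of the matrix needs to be summed (#475). The short standalone C++ sketch below is added for illustration only, using a toy 3x3 slice of the colorMatrix/colorDenom constants defined further down and hypothetical jamp values (meFull, meTri and the toy arrays are illustrative names, not identifiers from this patch); it checks that the triangular form reproduces the full quadratic form.

// Illustrative standalone sketch (NOT part of the generated color_sum.cc):
// verify that the triangular color sum equals the full quadratic form
// for a real symmetric color matrix with a constant denominator.
#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
  constexpr int N = 3;                      // toy size (the real process below uses ncolor = 12)
  const double denom = 3;                   // constant color denominator, as in colorDenom below
  const double cf[N][N] = { { 48, -6, 16 }, // toy slice of the real symmetric color matrix below
                            { -6, 48, -2 },
                            { 16, -2, 48 } };
  const std::complex<double> jamp[N] = { { 1.0, 0.5 }, { -0.25, 2.0 }, { 0.75, -1.0 } }; // hypothetical jamps
  // Full quadratic form: sum_ij conj(jamp_i) * (cf_ij/denom) * jamp_j (imaginary parts cancel by symmetry)
  double meFull = 0;
  for( int i = 0; i < N; i++ )
    for( int j = 0; j < N; j++ )
      meFull += ( std::conj( jamp[i] ) * ( cf[i][j] / denom ) * jamp[j] ).real();
  // Triangular form (#475): diagonal terms once, off-diagonal terms doubled because cf is symmetric
  double meTri = 0;
  for( int i = 0; i < N; i++ )
  {
    double ztempR = ( cf[i][i] / denom ) * jamp[i].real();
    double ztempI = ( cf[i][i] / denom ) * jamp[i].imag();
    for( int j = i + 1; j < N; j++ )
    {
      ztempR += 2 * ( cf[i][j] / denom ) * jamp[j].real();
      ztempI += 2 * ( cf[i][j] / denom ) * jamp[j].imag();
    }
    meTri += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  assert( std::abs( meFull - meTri ) < 1e-12 * std::abs( meFull ) );
  printf( "full color sum = %f, triangular color sum = %f\n", meFull, meTri );
  return 0;
}

The same identity is applied by the CUDA color_sum_kernel further down (factor 2 on the strict lower triangle), while the BLAS path keeps the full normalized color matrix so that the color sum can be expressed as one GEMM plus batched dot products.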
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, -6, 16, -2, 16, -2, 6, 6, 0, -2, 16, 0 }, + { -6, 48, -2, 16, -2, 16, 6, 6, 0, 16, -2, 0 }, + { 16, -2, 48, -6, 6, 6, 16, -2, -2, 0, 0, 16 }, + { -2, 16, -6, 48, 6, 6, -2, 16, 16, 0, 0, -2 }, + { 16, -2, 6, 6, 48, -6, 16, -2, 16, 0, 0, -2 }, + { -2, 16, 6, 6, -6, 48, -2, 16, -2, 0, 0, 16 }, + { 6, 6, 16, -2, 16, -2, 48, -6, 0, 16, -2, 0 }, + { 6, 6, -2, 16, -2, 16, -6, 48, 0, -2, 16, 0 }, + { 0, 0, -2, 16, 16, -2, 0, 0, 48, 16, 16, 6 }, + { -2, 16, 0, 0, 0, 0, 16, -2, 16, 48, 6, 16 }, + { 16, -2, 0, 0, 0, 0, -2, 16, 16, 6, 48, 16 }, + { 0, 0, 16, -2, -2, 16, 0, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: 
number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/configs.inc index e246a996f4..9e83bfd791 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/configs.inc @@ -480,3 +480,5 @@ C Diagram 35 DATA TPRID(-3,35)/0/ C Number of configs DATA MAPCONFIG(0)/35/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f index 65c377ffc0..859d368c2b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,111 +434,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01, - $ -2.000000000000000D+00,5.333333333333333D+00, - $ -6.666666666666666D-01,5.333333333333333D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 1),I= 7, 12) /2.000000000000000D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,-12,32,-4,32,-4,12,12,0,-4,32,0/ C 1 T(2,1) T(5,6,3,4) - DATA (CF(I, 2),I= 1, 6) /-2.000000000000000D+00 - $ ,1.600000000000000D+01,-6.666666666666666D-01 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,5.333333333333333D+00/ - DATA (CF(I, 2),I= 7, 12) /2.000000000000000D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 13, 23) /48,-4,32,-4,32,12,12,0,32,-4,0/ C 1 T(2,1) T(6,5,3,4) - DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00, - $ -6.666666666666666D-01,1.600000000000000D+01, - $ -2.000000000000000D+00,2.000000000000000D+00,2.000000000000000D - $ +00/ - DATA (CF(I, 3),I= 7, 12) /5.333333333333333D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 24, 33) /48,-12,12,12,32,-4,-4,0,0,32/ C 1 T(2,4) T(5,6,3,1) - DATA (CF(I, 4),I= 1, 6) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,-2.000000000000000D+00 - $ ,1.600000000000000D+01,2.000000000000000D+00,2.000000000000000D - $ +00/ - DATA (CF(I, 4),I= 7, 12) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 34, 42) /48,12,12,-4,32,32,0,0,-4/ C 1 T(2,4) T(6,5,3,1) - DATA (CF(I, 5),I= 1, 6) /5.333333333333333D+00, - $ 
-6.666666666666666D-01,2.000000000000000D+00,2.000000000000000D - $ +00,1.600000000000000D+01,-2.000000000000000D+00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 43, 50) /48,-12,32,-4,32,0,0,-4/ C 1 T(3,1) T(5,6,2,4) - DATA (CF(I, 6),I= 1, 6) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,2.000000000000000D+00,2.000000000000000D - $ +00,-2.000000000000000D+00,1.600000000000000D+01/ - DATA (CF(I, 6),I= 7, 12) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 51, 57) /48,-4,32,-4,0,0,32/ C 1 T(3,1) T(6,5,2,4) - DATA (CF(I, 7),I= 1, 6) /2.000000000000000D+00 - $ ,2.000000000000000D+00,5.333333333333333D+00, - $ -6.666666666666666D-01,5.333333333333333D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01, - $ -2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 58, 63) /48,-12,0,32,-4,0/ C 1 T(3,4) T(5,6,2,1) - DATA (CF(I, 8),I= 1, 6) /2.000000000000000D+00 - $ ,2.000000000000000D+00,-6.666666666666666D-01 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /-2.000000000000000D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 64, 68) /48,0,-4,32,0/ C 1 T(3,4) T(6,5,2,1) - DATA (CF(I, 9),I= 1, 6) /0.000000000000000D+00 - $ ,0.000000000000000D+00,-6.666666666666666D-01 - $ ,5.333333333333333D+00,5.333333333333333D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /0.000000000000000D+00 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(5,2,1) T(6,3,4) - DATA (CF(I, 10),I= 1, 6) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,0.000000000000000D+00,0.000000000000000D - $ +00,0.000000000000000D+00,0.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /5.333333333333333D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(5,2,4) T(6,3,1) - DATA (CF(I, 11),I= 1, 6) /5.333333333333333D+00, - $ -6.666666666666666D-01,0.000000000000000D+00,0.000000000000000D - $ +00,0.000000000000000D+00,0.000000000000000D+00/ - DATA (CF(I, 11),I= 7, 12) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(5,3,1) T(6,2,4) - DATA (CF(I, 12),I= 1, 6) /0.000000000000000D+00 - $ ,0.000000000000000D+00,5.333333333333333D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,5.333333333333333D+00/ - DATA (CF(I, 12),I= 7, 12) /0.000000000000000D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(5,3,4) T(6,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -761,10 +681,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -773,6 +695,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index e6d6423d5e..1d17cbcb1b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
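The DeviceAccessJamp2 accessor introduced above addresses the jamp2 super-buffer as buffer[icol * nevt + ievt], i.e. all events for colour 0, then all events for colour 1, and so on. As a quick illustration of that event-major layout (a standalone host-side sketch, not part of the generated code; toyJamp2Index, nevt and the sample indices are made up for the example), consecutive events of the same colour sit next to each other in memory, which is what keeps accesses by consecutive GPU threads coalesced:

#include <cassert>

// jamp2 super-buffer layout assumed here: buffer[icol * nevt + ievt]
inline int toyJamp2Index( int icol, int ievt, int nevt )
{
  return icol * nevt + ievt;
}

int main()
{
  const int nevt = 8;
  // neighbouring events of the same colour are adjacent in memory
  assert( toyJamp2Index( 2, 3, nevt ) + 1 == toyJamp2Index( 2, 4, nevt ) );
  // different colours of the same event are nevt elements apart
  assert( toyJamp2Index( 3, 0, nevt ) - toyJamp2Index( 2, 0, nevt ) == nevt );
  return 0;
}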
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -559,158 +615,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 12. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uux_ttxuux()?) 
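The jamp2 running sums stored just above (|jamp|^2 per colour, accumulated over helicities) are what the event-by-event colour choice later consumes: a cumulative sum over colours is built and the first colour whose cumulative fraction exceeds the event's random number is selected, returned in the Fortran convention [1,ncolor]. A minimal host-side sketch of that sampling step, with the icolamp/iconfig masking omitted and all names (toySelectColor, jamp2, rndcol) hypothetical:

#include <array>
#include <cstdio>

// Cumulative-fraction draw over squared colour flows (mask over allowed colours omitted).
int toySelectColor( const std::array<double, 6>& jamp2, double rndcol )
{
  std::array<double, 6> targetamp{};
  for( std::size_t icol = 0; icol < jamp2.size(); icol++ )
    targetamp[icol] = ( icol == 0 ? 0. : targetamp[icol - 1] ) + jamp2[icol];
  for( std::size_t icol = 0; icol < jamp2.size(); icol++ )
    if( rndcol < targetamp[icol] / targetamp.back() ) return (int)icol + 1; // Fortran range [1,ncolor]
  return (int)jamp2.size(); // numerical safety net
}

int main()
{
  const std::array<double, 6> jamp2{ 1., 4., 2., 0., 3., 0. };
  std::printf( "selected colour = %d\n", toySelectColor( jamp2, 0.45 ) ); // prints 2
  return 0;
}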
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
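The removed comments above describe the #475 optimisation that survives in the new color_sum implementation: because the colour matrix is real and symmetric, the complex quadratic form over the jamps collapses to two real ones (A·M·A + B·M·B with A the real parts and B the imaginary parts), and the symmetric double sum can be folded into "diagonal plus twice the upper triangle". A small self-contained check of that identity (ncolor=2 and the sample amplitudes are arbitrary illustration values; this is not the plugin code):

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 2;
  const double M[ncolor][ncolor] = { { 27, 9 }, { 9, 27 } }; // real, symmetric toy colour matrix
  const std::complex<double> J[ncolor] = { { 1.5, -0.5 }, { 0.25, 2.0 } };
  // Full complex quadratic form: sum_ij conj(J_i) M_ij J_j (imaginary part cancels for symmetric M)
  std::complex<double> full = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      full += std::conj( J[i] ) * M[i][j] * J[j];
  // Triangular real form: diagonal terms once, off-diagonal terms counted twice
  double tri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = M[i][i] * J[i].real();
    double ztempI = M[i][i] * J[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * M[i][j] * J[j].real();
      ztempI += 2 * M[i][j] * J[j].imag();
    }
    tri += ztempR * J[i].real() + ztempI * J[i].imag();
  }
  assert( std::abs( full.real() - tri ) < 1e-9 && std::abs( full.imag() ) < 1e-9 );
  return 0;
}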
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -798,7 +739,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -833,6 +778,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -875,6 +824,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -995,8 +948,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1004,25 +957,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1167,13 +1298,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1185,18 +1310,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1221,93 +1351,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1349,7 +1416,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1372,7 +1439,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1381,21 +1448,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1409,8 +1478,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1426,11 +1497,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1532,14 +1604,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index de4fd12c37..923ce8ceb8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 14; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f index 639c7207e3..0f6ceae7f0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f index 8fc5eeb386..cc5891ef22 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -497,51 +501,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
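For orientation before the body of the new color_sum.cc below: the reworked GPU sigmaKin above launches one calculate_jamps kernel per good helicity, each on its own stream, runs the colour sum per helicity, synchronises, and only then sums over helicities and draws the helicity/colour choice per event. A rough host-side analogue of that control flow, using std::thread purely as a stand-in for CUDA streams (none of these names belong to the plugin, and the per-helicity values are faked):

#include <cstdio>
#include <numeric>
#include <thread>
#include <vector>

int main()
{
  const int nGoodHel = 4;
  std::vector<double> mePerHel( nGoodHel, 0. );
  std::vector<std::thread> streams;
  // one "stream" per good helicity computes that helicity's contribution independently
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    streams.emplace_back( [ighel, &mePerHel]() { mePerHel[ighel] = 0.1 * ( ighel + 1 ); } );
  // join before the reduction (cf. the gpuDeviceSynchronize before helicity/colour selection)
  for( auto& t : streams ) t.join();
  const double me = std::accumulate( mePerHel.begin(), mePerHel.end(), 0. );
  std::printf( "ME summed over %d helicities = %f\n", nGoodHel, me );
  return 0;
}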
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt)
+                              nevtN ) );                          // there are nevtN (nhel*nevt) "batches"
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Convert MEs from float to double
+    for( int ighel = 0; ighel < nhel; ighel++ )
+    {
+      fptype* hAllMEs = ghelAllMEs + ighel * nevt;          // MEs for a single helicity ihel
+      fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel
+      gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 );
+    }
+#endif
+  }
+#endif /* clang-format on */
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+    // CASE 1: KERNEL
+    if( !pBlasHandle )
+    {
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      // Loop over helicities
+      for( int ighel = 0; ighel < nGoodHel; ighel++ )
+      {
+        fptype* hAllMEs = ghelAllMEs + ighel * nevt;           // MEs for one specific helicity ighel
+        const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel
+        gpuStream_t hStream = ghelStreams[ighel];
+        gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel );
+      }
+    }
+    // CASE 2: BLAS
+    else
+    {
+#ifdef MGONGPU_HAS_NO_BLAS
+      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
+#else
+      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
+      // Reset the tmp buffer
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
+#else
+      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
+#endif
+      // Delegate the color sum to BLAS for all good helicities
+      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
+#endif
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+} // end namespace
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/configs.inc
index a45cbe8205..cc114056be 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/configs.inc
+++
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/configs.inc @@ -195,3 +195,5 @@ C Diagram 14 DATA (SPROP(I,-4,14),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/14/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f index 9a6d844439..0070f4afc8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(16) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,39 +434,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(2,1) T(3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(2,1) T(3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(2,4) T(3,1) T(5,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(2,4) T(3,6) T(5,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(2,6) T(3,1) T(5,4) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(2,6) T(3,4) T(5,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -585,10 +565,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -597,6 +579,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index bf560d981f..3e1fcb02e3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,9 +103,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -111,10 +114,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -173,43 +173,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -221,7 +277,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -230,14 +285,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
@@ -263,14 +321,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -294,7 +348,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -308,7 +361,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -319,6 +371,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -449,158 +505,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 12. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uxcx_ttxuxcx()?) 
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -688,7 +629,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -723,6 +668,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -765,6 +714,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -885,8 +838,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -894,25 +847,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 
+ //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1057,13 +1188,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1075,18 +1200,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1111,93 +1241,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
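// A minimal raw-CUDA sketch (hypothetical kernel and buffer names; the real
// code goes through the gpu*/gpuLaunchKernelStream abstraction macros and the
// calculate_jamps / color_sum_gpu / add_and_select_hel kernels above) of the
// stream-per-good-helicity pattern: each good helicity works on its own stream
// and on its own nevt-sized slice of a super-buffer, and the cross-helicity
// reduction only starts after a device-wide synchronization.
#include <cuda_runtime.h>
__global__ void perHelicityWork( float* slice ) // stand-in for one helicity's jamp and color-sum work
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  slice[ievt] = 1.f; // dummy per-event result for this helicity
}
__global__ void sumOverHelicities( const float* superBuffer, float* out, int nGoodHel, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  float sum = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) sum += superBuffer[ighel * nevt + ievt];
  out[ievt] = sum; // stand-in for the sum over helicities in add_and_select_hel
}
void streamPerHelicitySketch( int nGoodHel, int gpublocks, int gputhreads )
{
  const int nevt = gpublocks * gputhreads;
  float *superBuffer, *out;
  cudaMalloc( (void**)&superBuffer, nGoodHel * nevt * sizeof( float ) );
  cudaMalloc( (void**)&out, nevt * sizeof( float ) );
  cudaStream_t* streams = new cudaStream_t[nGoodHel];
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) // one launch per good helicity, each on its own stream
    perHelicityWork<<<gpublocks, gputhreads, 0, streams[ighel]>>>( superBuffer + ighel * nevt );
  cudaDeviceSynchronize(); // wait for all helicity streams before the cross-helicity steps
  sumOverHelicities<<<gpublocks, gputhreads>>>( superBuffer, out, nGoodHel, nevt );
  cudaDeviceSynchronize();
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
  delete[] streams;
  cudaFree( superBuffer );
  cudaFree( out );
}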
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1239,7 +1306,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1262,7 +1329,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1271,21 +1338,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1299,8 +1368,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1316,11 +1387,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1422,14 +1494,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index 13a02cdb83..a2ab984dd2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -80,6 +81,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -127,7 +129,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -135,9 +137,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -157,34 +161,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum 
over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f index bf9951e502..07b686127b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f index 24b0abb30c..107d8a0051 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -142,7 +142,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF CX1=PDG2PDF(LPP(IB(1)),-4, IB(1),XBK(IB(1)), QSCALE) UX1=PDG2PDF(LPP(IB(1)),-2, IB(1),XBK(IB(1)), QSCALE) @@ -151,7 +151,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -243,7 +243,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -321,6 +321,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -406,20 +410,20 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) CX1(IVEC)=PDG2PDF(LPP(IB(1)),-4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) UX1(IVEC)=PDG2PDF(LPP(IB(1)),-2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) DX1(IVEC)=PDG2PDF(LPP(IB(1)),-1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -513,51 +517,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
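// Per event and per helicity, the color sum implemented by this new file is
//   |M|^2 += sum_{i,j} Re( conj(jamp_i) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp_j )
// with a real, symmetric colorMatrix. A minimal scalar reference of that
// quadratic form is sketched below (hypothetical helper name, std::complex
// instead of the SIMD/GPU cxtype types used in this file); the CPU, kernel and
// BLAS variants below are optimized reorderings of this same sum.
#include <complex>
static double colorSumReference( const std::complex<double>* jamp, // input: QCD partial amplitudes [ncolor]
                                 const double* colorMatrix,        // input: real symmetric matrix, flattened [ncolor*ncolor]
                                 const double* colorDenom,         // input: per-row denominators [ncolor] (all 1 for this process)
                                 const int ncolor )                // input: number of leading colors (6 here)
{
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += colorMatrix[icol * ncolor + jcol] / colorDenom[icol] * jamp[jcol];
    me2 += std::real( std::conj( jamp[icol] ) * ztemp );
  }
  return me2;
}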
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
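// The rewrite described in the comment above uses two identities: for a real
// matrix M the hermitian quadratic form conj(A+iB)^T M (A+iB) reduces to
// A^T M A + B^T M B (the mixed terms are purely imaginary and drop out of the
// real part), and when the row-normalized matrix M[i][j]/d[i] is symmetric (as
// it is for these color matrices) the double sum can be folded onto the upper
// triangle with a factor 2 on the off-diagonal entries. A minimal scalar
// sketch of the folded form (hypothetical names; the "2*" and "/denom" factors
// are pre-applied, as in the constexpr cf2 above):
static double quadFormUpperTriangle( const double* re,  // input: Re(jamp) [ncolor]
                                     const double* im,  // input: Im(jamp) [ncolor]
                                     const double* cf2, // input: flattened [ncolor*ncolor]; cf2[i*n+i]=M[i][i]/d[i], cf2[i*n+j] (j>i) = 2*M[i][j]/d[i]
                                     const int ncolor )
{
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = cf2[icol * ncolor + icol] * re[icol];
    double ztempI = cf2[icol * ncolor + icol] * im[icol];
    for( int jcol = icol + 1; jcol < ncolor; jcol++ ) // upper triangle only
    {
      ztempR += cf2[icol * ncolor + jcol] * re[jcol];
      ztempI += cf2[icol * ncolor + jcol] * im[jcol];
    }
    me2 += re[icol] * ztempR + im[icol] * ztempI; // equals the full symmetric double sum
  }
  return me2;
}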
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/configs.inc index e2a9d0c352..bc8dbca9d7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/configs.inc +++ 
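// For reference, the two BLAS steps in color_sum_blas above written out as
// plain loops for a single (real or imaginary) jamp component (hypothetical
// helper name; the production code offloads step 1 to one gemm covering all
// helicities and events at once, and step 2 to a gemmStridedBatched with a 1x1
// result per event, i.e. a batched dot product, using beta=1 so that the real
// and imaginary contributions accumulate into the same MEs buffer).
static void blasColorSumReference( const double* normColMat, // input: normalized color matrix, flattened [ncolor*ncolor]
                                   const double* jampsComp,  // input: one jamp component, [ncolor][nhel*nevt] (column = one helicity*event)
                                   double* ztemp,            // tmp: [ncolor][nhel*nevt]
                                   double* mes,              // in/out: [nhel*nevt], += contribution of this component
                                   const int ncolor,
                                   const int nhelnevt )      // nhel*nevt
{
  // Step 1 (the gemm): ztemp = normColMat * jampsComp
  for( int icol = 0; icol < ncolor; icol++ )
    for( int j = 0; j < nhelnevt; j++ )
    {
      double sum = 0;
      for( int kcol = 0; kcol < ncolor; kcol++ )
        sum += normColMat[icol * ncolor + kcol] * jampsComp[kcol * nhelnevt + j];
      ztemp[icol * nhelnevt + j] = sum;
    }
  // Step 2 (the gemmStridedBatched): mes[j] += dot( jampsComp(:,j), ztemp(:,j) ) for each helicity*event j
  for( int j = 0; j < nhelnevt; j++ )
    for( int icol = 0; icol < ncolor; icol++ )
      mes[j] += jampsComp[icol * nhelnevt + j] * ztemp[icol * nhelnevt + j];
}
// Called once with the real components and once with the imaginary components,
// this reproduces the same |M|^2 sums as the kernel path, up to the memory
// layout details handled by the cuBLAS leading-dimension and stride arguments.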
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/configs.inc @@ -105,3 +105,5 @@ C Diagram 7 DATA (SPROP(I,-4,7),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/7/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f index 2a76dfeffb..3be02200e4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -76,10 +76,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -280,17 +277,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -360,7 +346,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -408,7 +394,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -451,39 +438,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(1,4) T(2,5) T(3,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(1,4) T(2,6) T(3,5) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(1,5) T(2,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(1,5) T(2,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(1,6) T(2,4) T(3,5) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(1,6) T(2,5) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
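// C++ sketch (hypothetical helper name) of the packed storage used by the new
// integer CF array above: only the upper triangle is kept, stored row by row,
// with the off-diagonal entries pre-doubled (18 = 2*9, 6 = 2*3) so that the
// inner loop can start at J = I, and with a single integer DENOM divided out
// of MATRIX1 once at the end instead of being baked into floating-point DATA
// statements.
static int packedUpperTriangleIndex( const int i, const int j, const int ncolor ) // 0-based indices, requires j >= i
{
  // Rows 0..i-1 hold ncolor + (ncolor-1) + ... + (ncolor-i+1) entries in total.
  const int rowStart = i * ncolor - i * ( i - 1 ) / 2;
  return rowStart + ( j - i );
}
// For ncolor = 6 this maps (0,0)..(0,5) to 0..5, (1,1)..(1,5) to 6..10, etc.,
// matching the index ranges of the DATA statements above (shifted by one for
// Fortran's 1-based CF array).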
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -553,10 +533,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -565,6 +547,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index 83faf9192b..26b682be00 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,9 +101,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -109,10 +112,7 @@ namespace mg5amcCpu using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,43 +171,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* 
clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // 
non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -219,7 +275,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -228,14 +283,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
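A minimal host-side sketch (example sizes only, not generated code) of the buffer layouts assumed here: DeviceAccessJamp2 strides the per-color jamp2 sums event-major as buffer[icol * nevt + ievt], while the per-helicity jamp super-buffers use an [ix2][icol][ihel][ievt] striding (ix2 = 0,1 for the real and imaginary parts), consistent with convertD2F_Jamps further below. The check shows why offsetting the super-buffer by ighel * nevt yields a valid single-helicity view accessed with ihel0 = 0.

  #include <cassert>
  #include <cstddef>

  // Flat index for a jamp super-buffer laid out as [ix2][icol][ihel][ievt]
  // (ncolor = 6 for this P2_uxux_ttxuxux process)
  constexpr std::size_t jampIndex( int ix2, int icol, int ihel, int ievt, int nhel, int nevt )
  {
    return ( ( static_cast<std::size_t>( ix2 ) * 6 + icol ) * nhel + ihel ) * nevt + ievt;
  }

  int main()
  {
    const int nhel = 4, nevt = 32;           // example values only
    const int ighel = 2, icol = 5, ievt = 7; // example values only
    // "ghelAllJamps + ighel * nevt" plus an ( icol, ihel0 = 0 ) access reaches the same
    // element as indexing the full super-buffer at ( icol, ighel )
    assert( ighel * nevt + jampIndex( 0, icol, 0, ievt, nhel, nevt ) == jampIndex( 0, icol, ighel, ievt, nhel, nevt ) );
    return 0;
  }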
@@ -261,14 +319,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -292,7 +346,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -306,7 +359,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -317,6 +369,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -559,158 +615,43 @@ namespace mg5amcCpu jamp_sv[5] -= 1. / 12. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uxux_ttxuxux()?) 
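To spell out what the color matrix encodes, here is a minimal scalar sketch of the per-event quadratic form |M|^2 += J^dagger (cf/denom) J over the ncolor = 6 color flows, using plain std::complex instead of the cxtype_sv/fptype_sv vector types; the cf and denom values are copied from the constexpr arrays listed just below (and again in the new color_sum.cc).

  #include <array>
  #include <complex>

  // Scalar version of the color sum for one event (not the generated SIMD/GPU code)
  double colorSumScalar( const std::array<std::complex<double>, 6>& jamp )
  {
    constexpr double denom[6] = { 1, 1, 1, 1, 1, 1 };
    constexpr double cf[6][6] = { { 27, 9, 9, 3, 3, 9 },
                                  { 9, 27, 3, 9, 9, 3 },
                                  { 9, 3, 27, 9, 9, 3 },
                                  { 3, 9, 9, 27, 3, 9 },
                                  { 3, 9, 9, 3, 27, 9 },
                                  { 9, 3, 3, 9, 9, 27 } };
    double me = 0;
    for( int icol = 0; icol < 6; icol++ )
    {
      std::complex<double> ztemp = 0;
      for( int jcol = 0; jcol < 6; jcol++ ) ztemp += cf[icol][jcol] * jamp[jcol]; // ztemp = (cf * J)[icol]
      me += ( ztemp * std::conj( jamp[icol] ) ).real() / denom[icol];             // me += Re( J[icol]^* ztemp ) / denom[icol]
    }
    return me;
  }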
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
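The comment above is the rationale for the folded form used in the C++ path (TriangularNormalizedColorMatrix here, cf2 in the new color_sum.cc): for a real symmetric cf, J^dagger (cf/denom) J = Re(J)^T M Re(J) + Im(J)^T M Im(J), and each off-diagonal term can be counted once with a factor 2. A scalar sketch of that rewrite (not the SIMD implementation; with the unit denominators of this process it reproduces the full quadratic form exactly):

  #include <complex>

  // Folded upper-triangle color sum for one event (scalar sketch of the #475 rewrite)
  double colorSumTriangular( const double cf[6][6], const double denom[6], const std::complex<double> jamp[6] )
  {
    double me = 0;
    for( int icol = 0; icol < 6; icol++ )
    {
      // Diagonal term, normalized by denom[icol] (precomputed at compile time in cf2)
      double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real();
      double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
      // Off-diagonal terms of the upper triangle carry the factor 2 from symmetry
      for( int jcol = icol + 1; jcol < 6; jcol++ )
      {
        ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
        ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
      }
      me += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI; // AMA + BMB, no cross terms
    }
    return me;
  }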
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -798,7 +739,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -833,6 +778,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -875,6 +824,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -995,8 +948,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1004,25 +957,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity 
#403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1167,13 +1298,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1185,18 +1310,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -1221,93 +1351,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1349,7 +1416,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1372,7 +1439,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1381,21 +1448,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1409,8 +1478,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1426,11 +1497,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1532,14 +1604,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 0b67fca178..5623c32c4f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,6 +79,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 14; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -133,9 +135,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -155,34 +159,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running 
sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f index f8d2319067..ea3c698850 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f index f9adb0c2a2..5518a456a6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF CX1=PDG2PDF(LPP(IB(1)),-4, IB(1),XBK(IB(1)), QSCALE) SX1=PDG2PDF(LPP(IB(1)),-3, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) CX1(IVEC)=PDG2PDF(LPP(IB(1)),-4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) SX1(IVEC)=PDG2PDF(LPP(IB(1)),-3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) UX1(IVEC)=PDG2PDF(LPP(IB(1)),-2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) DX1(IVEC)=PDG2PDF(LPP(IB(1)),-1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -497,51 +501,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc new file mode 100644 index 0000000000..a1e583992a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc @@ -0,0 +1,431 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int 
icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/configs.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/configs.inc index 9841fb23df..6a4d7d209f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/configs.inc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/configs.inc @@ -210,3 +210,5 @@ C Diagram 14 DATA (SPROP(I,-4,14),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/14/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f index 35761964e7..5a3d10c673 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -74,10 +74,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -278,17 +275,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -358,7 +344,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +390,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(16) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,39 +434,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(1,4) T(2,5) T(3,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(1,4) T(2,6) T(3,5) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(1,5) T(2,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(1,5) T(2,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(1,6) T(2,4) T(3,5) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(1,6) T(2,5) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -585,10 +565,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -597,6 +579,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = 
gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
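+      # (fall back to a noBLAS build, i.e. compile with -DMGONGPU_HAS_NO_BLAS as set further below)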
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
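+# (DSIG groups driver.o with the auto_dsig*.o objects for the Fortran-only madevent build;
+#  DSIG_cudacpp groups driver_cudacpp.o with the *_cudacpp.o variants of the same sources,
+#  which are compiled with -DMG5AMC_MEEXPORTER_CUDACPP via the %_cudacpp.o rule below.)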
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk 
new file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if 
log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in 
opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the 
variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
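The histograms.py hunk above fixes two details of the gnuplot handling: the decoded `--version` output is now actually assigned (`output = output.decode(errors='ignore')` instead of a discarded call), and the major version is compared as an integer rather than coercing the whole token to float. A minimal standalone sketch of that check, assuming a `gnuplot` executable on PATH; the helper name `gnuplot_major_version` and the default-to-v5 fallback are illustrative, not part of the patch:

    import subprocess

    def gnuplot_major_version(exe='gnuplot'):
        """Return gnuplot's major version as an int, or None if it cannot be determined.

        The banner looks like 'gnuplot 5.4 patchlevel 8', so token [1] is '5.4'
        and the part before the first dot is the major version.
        """
        try:
            proc = subprocess.Popen([exe, '--version'],
                                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        except OSError:
            return None
        output, _ = proc.communicate()
        output = output.decode(errors='ignore')  # decode() returns a new str; it must be re-assigned
        if not output:
            return None
        try:
            return int(output.split()[1].split('.')[0])
        except (IndexError, ValueError):
            return None

    # pick the gnuplot-4 templates only when the major version is genuinely below 5
    use_v4_templates = (gnuplot_major_version() or 5) < 5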
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
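The `split(a, n)` helper shown earlier in this madevent_interface.py diff (used to chunk the G directories for partial unweighting) slices a list into n contiguous, nearly equal pieces, with the first `len(a) % n` chunks one element longer. A self-contained sketch of the same logic; the demo values are only an illustration:

    def split(a, n):
        """Yield n contiguous chunks of a, as evenly sized as possible."""
        k, m = divmod(len(a), n)
        return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

    # 10 G-directories over 3 chunks -> sizes 4, 3, 3
    chunks = list(split(['G%d' % i for i in range(10)], 3))
    assert [len(c) for c in chunks] == [4, 3, 3]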
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
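The do_pythia8 hunk above now drives Pythia8 directly through its bundled `main164` example by default, and only falls back to the legacy MG5aMC_PY8_interface when `--old_interface` is requested or `main164` cannot be found. A minimal sketch of that lookup, assuming only the two candidate locations tried above; the function name `find_pythia8_main` is illustrative:

    import os

    def find_pythia8_main(pythia8_path):
        """Locate the bundled Pythia8 'main164' driver.

        Tries the same two locations as the do_pythia8 hunk above and returns
        None if neither exists, so the caller can fall back to the legacy
        MG5aMC_PY8_interface (the '--old_interface' route).
        """
        candidates = (
            os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
            os.path.join(pythia8_path, 'examples', 'main164'),
        )
        for exe in candidates:
            if os.path.exists(exe):
                return exe
        return None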
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
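The per-split card adjustment above mirrors a simple rule: each split LHE file only carries its own share of the events, so `Main:numberOfEvents` is set to that share and `HEPMCoutput:scaling` is multiplied by it to keep the weight normalization consistent. A sketch of just that arithmetic; the exact meaning of `HEPMCoutput:scaling` is Pythia8's, and `base_scaling`/`partition` stand in for the values read from the parallelization PY8Card and the LHE splitting step:

    def per_split_py8_overrides(base_scaling, partition):
        """Per-split overrides mirroring the loop above.

        Split i carries partition[i] events, so Main:numberOfEvents is set to
        that count and HEPMCoutput:scaling is the base value multiplied by it.
        """
        return [
            {'Main:numberOfEvents': n_i,
             'HEPMCoutput:scaling': base_scaling * float(n_i)}
            for n_i in partition
        ]

    # e.g. 1000 events split into three LHE files of 334/333/333 events
    overrides = per_split_py8_overrides(1.0, [334, 333, 333])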
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
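The new `remove_empty_events` helper above triages channels that produced no events by scanning the tail of their logs for known markers ("Impossible BW configuration", "Loosen cuts or increase max_events", "all returned zero") and reporting counts per reason. A rough standalone sketch of that triage, assuming a plain `log.txt` per G directory and reading it forwards instead of using `misc.BackRead`, so it only approximates the real helper:

    import os

    def classify_empty_channel(gdir, max_lines=200):
        """Return a rough reason why a G directory produced no events.

        Mirrors the spirit of remove_empty_events above: a tiny or missing
        events.lhe triggers a scan of the last lines of the channel log for
        the known failure markers.
        """
        try:
            if os.path.getsize(os.path.join(gdir, 'events.lhe')) >= 10:
                return 'ok'
        except OSError:
            pass
        try:
            with open(os.path.join(gdir, 'log.txt'), errors='ignore') as log:
                tail = log.readlines()[-max_lines:]
        except OSError:
            return 'not found'
        for line in reversed(tail):
            if 'Impossible BW configuration' in line:
                return 'bwconfig'
            if 'Loosen cuts or increase max_events' in line:
                return 'cuts'
            if 'all returned zero' in line:
                return 'zero'
        return 'unknown'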
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data b/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/madevent b/epochX/cudacpp/pp_tt012j.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/madevent +++ b/epochX/cudacpp/pp_tt012j.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h index 53dd560ed6..da11e740d9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc index 47a3a011b8..a5e188e4f8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index 76066c7bb1..24e0e80f84 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index 85f434b58f..ea45eb7817 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,17 +46,16 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -73,7 +72,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.12868547439575195  +DEBUG: model prefixing takes 0.1081535816192627  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -88,21 +87,21 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.699 s +1 processes with 72 diagrams generated in 4.226 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: 
Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -114,25 +113,25 @@ FileWriter t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.189 s -Wrote files for 119 helas calls in 0.388 s +DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 
47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (72 diagrams) in 0.172 s +Wrote files for 119 helas calls in 0.454 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.319 s +ALOHA: aloha creates 5 routines in 0.635 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.341 s +ALOHA: aloha creates 10 routines in 0.570 s VVV5 VVV5 FFV1 @@ -142,38 +141,32 @@ ALOHA: aloha creates 10 routines in 0.341 s VVVV1 VVVV9 VVVV10 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. 
The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses/P1_gg_ttxttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 275 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. quit -real 0m7.169s -user 0m6.853s -sys 0m0.298s -Code generation completed in 7 seconds +real 0m9.801s +user 0m8.912s +sys 0m0.731s +Code generation completed in 9 seconds ************************************************************ * * * W E L C O M E to * @@ -186,7 +179,7 @@ Code generation completed in 7 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -194,10 +187,9 @@ Code generation completed in 7 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards run @@ -216,7 +208,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -224,10 +216,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/COPYRIGHT b/epochX/cudacpp/smeft_gg_tttt.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/COPYRIGHT +++ b/epochX/cudacpp/smeft_gg_tttt.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat index 9bcf8cac8c..33b9ca5c22 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. 
* #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt b/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts b/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f b/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc b/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile b/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc b/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. 
Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. 
In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? 
- */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? + */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
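The grid-selection loop shown a few lines above can be read as a standalone helper. The following sketch is illustrative only (it assumes the default 256 threads per block and s_gputhreadsmin = 32 from the patch) and reproduces the same behaviour:

#include <stdexcept>
#include <utility>

// Halve the number of threads per block until nevt == gpublocks * gputhreads,
// never going below gputhreadsmin; throw if no such decomposition exists.
std::pair<int, int> chooseGpuGrid( int nevt, int gputhreadsmin = 32 )
{
  int gputhreads = 256;              // default number of gpu threads
  int gpublocks = nevt / gputhreads; // initial guess via integer division
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2;
    if( gputhreads < gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" );
    gpublocks = nevt / gputhreads;
  }
  return { gpublocks, gputhreads };
}
// For example, nevt=8192 yields (32, 256) and nevt=96 yields (3, 32); nevt=48 throws,
// consistent with the constructor requiring nevt to be a multiple of s_gputhreadsmin.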
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
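// Worked example of the cpos/fpos index mapping above (illustrative values only: this
// process has npar=6 external particles and np4=4 momentum components; neppM is assumed
// to be 4 here, but it depends on the SIMD/fptype build configuration):
//   ievt=5  =>  ipagM = ievt/neppM = 1,  ieppM = ievt%neppM = 1
//   with ipar=2, ip4=3:
//     fpos (AOS, Fortran-style)   = ievt*npar*np4 + ipar*np4 + ip4 = 5*24 + 8 + 3 = 131
//     cpos (AOSOA, cudacpp-style) = ipagM*npar*np4*neppM + ipar*np4*neppM + ip4*neppM + ieppM
//                                 = 1*96 + 2*16 + 3*4 + 1 = 141
// so out[141] = in[131] in the F2C direction (and the reverse assignment in C2F).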
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
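// Usage sketch (illustrative, not part of the patch) of the checkGpuBlas wrapper added in
// GpuRuntime.h above: any gpuBlas* call returning a gpuBlasStatus_t can be wrapped so that
// a non-success status is printed with its file and line and aborts via assert, exactly as
// checkGpu does for gpuError_t. This is the pattern used further down in this file:
//   gpuBlasHandle_t handle;
//   checkGpuBlas( gpuBlasCreate( &handle ) );       // expands to cublasCreate or hipblasCreate
//   checkGpuBlas( gpuBlasSetStream( handle, stream ) );
//   ...
//   checkGpuBlas( gpuBlasDestroy( handle ) );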
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
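// The MatrixElementKernelDevice constructor above enables BLAS color sums at run time by checking the
// CUDACPP_RUNTIME_BLASCOLORSUM environment variable inside a run-once "static bool first" block.
// Below is a minimal standalone C++ sketch of that pattern only (it is not part of the patch itself);
// the helper name parseBlasColorSumEnv is illustrative and does not exist in the plugin.
#include <cstdlib>
#include <iostream>
#include <string>

static bool parseBlasColorSumEnv()
{
  static bool first = true;
  static bool useBlas = false;
  if( first ) // evaluate the environment variable only once per process
  {
    first = false;
    const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
    useBlas = ( env && std::string( env ) != "" ); // any non-empty value enables BLAS color sums
    std::cout << ( useBlas ? "INFO: enable BLAS color sums" : "INFO: disable BLAS color sums" ) << std::endl;
  }
  return useBlas;
}

int main()
{
  return parseBlasColorSumEnv() ? 0 : 1; // usage example: exit code reflects the runtime choice
}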
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h index 2f711d8cc1..24800c08c9 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return 
NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA 
device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc index 96d77e5403..a1d3c787cf 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_SMEFTsim_topU3l_MwScheme_UFO_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const 
unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // 
non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
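// The DeviceAccessJamp2 accessor introduced above addresses the jamp2 buffer as buffer[icol * nevt + ievt],
// i.e. a structure-of-arrays layout where all events for a given color icol are contiguous.
// The following is a host-side C++ sketch of that indexing convention only (toy sizes, illustrative names);
// on the device, nevt and ievt are derived from gridDim/blockDim/blockIdx/threadIdx as shown above.
#include <cassert>
#include <vector>

int main()
{
  const int ncolor = 12; // leading colors for gg -> ttxttx (see ncolor in CPPProcess.h)
  const int nevt = 4;    // illustrative event count
  std::vector<double> jamp2( ncolor * nevt, 0. );
  // host-side analogue of DeviceAccessJamp2::kernelAccessIcol for a given (icol, ievt)
  auto access = [&]( int icol, int ievt ) -> double& { return jamp2[icol * nevt + ievt]; };
  access( 3, 2 ) += 1.5; // accumulate |jamp|^2 for color 3 of event 2
  assert( jamp2[3 * nevt + 2] == 1.5 );
  return 0;
}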
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -1394,164 +1450,43 @@ namespace mg5amcCpu jamp_sv[8] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxttx()?) 
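// The hunk below removes the inline color-matrix sum (cf[ncolor][ncolor] and denom[ncolor]), which this
// patch delegates to color_sum.h (color_sum_gpu / color_sum_cpu, optionally via cuBLAS/hipBLAS).
// For reference, this is a standalone C++ sketch of the quadratic form that block computed,
// |M|^2 += sum_{i,j} cf[i][j]/denom[i] * ( Re(jamp_i)Re(jamp_j) + Im(jamp_i)Im(jamp_j) ),
// using a toy 2x2 color matrix instead of the real 12x12 one (values are illustrative only).
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;
  const double denom[ncolor] = { 3, 3 };
  const double cf[ncolor][ncolor] = { { 48, 16 }, { 16, 48 } };
  const std::complex<double> jamp[ncolor] = { { 1., 2. }, { -0.5, 0.25 } };
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ ) // full row sum, as in the removed CUDA branch
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real();
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    me2 += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  printf( "|M|^2 (color-summed, toy values) = %f\n", me2 ); // prints 85.000000 for these toy inputs
  return 0;
}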
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, - { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, - { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, - { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, - { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, - { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, - { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
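// The TriangularNormalizedColorMatrix struct removed above implements the #475 optimization: since cf is
// real and symmetric (and denom is constant), the "/denom" and factor-2 off-diagonal weights can be folded
// into an upper-triangular matrix computed once, so the color sum loops only over jcol >= icol.
// Standalone C++ sketch of that idea with a toy 2x2 matrix (illustrative only, assumes constant denom);
// it yields the same |M|^2 as the full-matrix form sketched earlier.
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;
  constexpr double denom[ncolor] = { 3, 3 };
  constexpr double cf[ncolor][ncolor] = { { 48, 16 }, { 16, 48 } };
  double cf2[ncolor][ncolor] = {};
  for( int i = 0; i < ncolor; i++ )
  {
    cf2[i][i] = cf[i][i] / denom[i]; // diagonal terms
    for( int j = i + 1; j < ncolor; j++ )
      cf2[i][j] = 2 * cf[i][j] / denom[i]; // off-diagonal terms with the symmetry factor 2 folded in
  }
  const double jampR[ncolor] = { 1., -0.5 };
  const double jampI[ncolor] = { 2., 0.25 };
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf2[i][i] * jampR[i];
    double ztempI = cf2[i][i] * jampI[i];
    for( int j = i + 1; j < ncolor; j++ ) // upper triangle only
    {
      ztempR += cf2[i][j] * jampR[j];
      ztempI += cf2[i][j] * jampI[j];
    }
    me2 += jampR[i] * ztempR + jampI[i] * ztempI;
  }
  printf( "|M|^2 (upper-triangle loop, toy values) = %f\n", me2 ); // prints 85.000000, matching the full form
  return 0;
}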
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -1639,7 +1574,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1674,6 +1613,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1716,6 +1659,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1836,8 +1783,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1845,25 +1792,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to 
store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -2008,13 +2133,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1024 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -2026,18 +2145,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -2062,93 +2186,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2190,7 +2251,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -2213,7 +2274,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -2222,21 +2283,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -2250,8 +2313,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -2267,11 +2332,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -2373,14 +2439,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h index d207c3303f..c1de405ab1 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 72; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor 
individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f index ef1e17705f..a3462226d4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f index 2086a21e98..c42dfd786e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! 
CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc new file mode 100644 index 0000000000..767405ac3b --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc @@ -0,0 +1,437 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
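For reference, the new color_sum.cc file whose content follows factors the color sum out of CPPProcess.cc. Below is a minimal standalone sketch (hypothetical names such as colorSum; this is not the generated code) of the quadratic form it evaluates, |M|^2 += sum_{i,j} conj(J_i) * CF(i,j)/denom(i) * J_j, using the same real-symmetric-matrix and upper-triangle tricks (#475) as the color_sum_cpu implementation further down:

    // Minimal sketch: because CF is real and CF(i,j)/denom(i) is symmetric, the complex
    // quadratic form splits into two real ones ("AMA + BMB"), and only the diagonal plus
    // the upper triangle (counted twice) need to be visited.
    #include <array>
    #include <complex>

    template<int ncolor>
    double colorSum( const std::array<std::complex<double>, ncolor>& jamp, // color amplitudes J_i for one event and one helicity
                     const double ( &cf )[ncolor][ncolor],                 // real symmetric color matrix CF
                     const double ( &denom )[ncolor] )                     // color denominators
    {
      double me2 = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        // Diagonal term, then doubled off-diagonal terms for jcol > icol
        double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real();
        double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
        for( int jcol = icol + 1; jcol < ncolor; jcol++ )
        {
          ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
          ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
        }
        me2 += ztempR * jamp[icol].real() + ztempI * jamp[icol].imag();
      }
      return me2;
    }

This is the scalar analogue of the SIMD color_sum_cpu path below, and also of the cuBLAS path, which evaluates the same two real quadratic forms as one GEMM (color matrix times jamps) followed by per-event batched dot products.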
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, + { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, + { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, + { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, + { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, + { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, + { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: 
number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/configs.inc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/configs.inc index 3710cb6806..2038dc7a01 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/configs.inc +++ 
b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/configs.inc @@ -1020,3 +1020,5 @@ C Diagram 70 DATA (SPROP(I,-4,70),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/70/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f index f7f23196eb..5997e65826 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fbridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/makefile_original.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f index 45032ad41c..0f7d6543d2 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -275,17 +272,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -355,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -398,7 +384,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(34) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -441,111 +428,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,0.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 1),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,32,12,0,32,-4,0,-12,-4,-4,12/ C 1 T(1,2,3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,0.000000000000000D+00/ - DATA (CF(I, 2),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 13, 23) /48,12,32,32,0,0,-4,-4,-12,12,-4/ C 1 T(1,2,3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 24, 33) /48,32,-4,0,0,32,-4,12,-12,-4/ C 1 T(1,2,5,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /2.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 4),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 34, 42) /48,0,-4,32,0,12,-4,-4,-12/ C 1 T(1,2,5,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ 
,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 43, 50) /48,32,32,12,0,-4,32,0/ C 1 T(1,3,4) T(2,5,6) - DATA (CF(I, 6),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01/ - DATA (CF(I, 6),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 51, 57) /48,12,32,-4,0,0,32/ C 1 T(1,3,6) T(2,5,4) - DATA (CF(I, 7),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 58, 63) /48,32,32,0,0,-4/ C 1 T(1,5,4) T(2,3,6) - DATA (CF(I, 8),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 64, 68) /48,0,32,-4,0/ C 1 T(1,5,6) T(2,3,4) - DATA (CF(I, 9),I= 1, 6) /-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(2,1,3,4) T(5,6) - DATA (CF(I, 10),I= 1, 6) /-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,0.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(2,1,3,6) T(5,4) - DATA (CF(I, 11),I= 1, 6) /-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ - DATA (CF(I, 11),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(2,1,5,4) T(3,6) - DATA (CF(I, 12),I= 1, 6) /2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ - DATA (CF(I, 12),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(2,1,5,6) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -910,10 +830,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -922,6 +844,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
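Going back to the matrix1.f change shown earlier in this patch: the full REAL*8 CF(NCOLOR,NCOLOR) color matrix is replaced by a packed INTEGER array holding only the upper triangle row by row (diagonal entries as-is, off-diagonal entries pre-doubled) plus a single common DENOM applied once at the end. A minimal C++ illustration of the packed indexing and of the equivalent sum (hypothetical names, NCOLOR=12 for this process, NAMPSO split-order loops omitted; this is a sketch, not the generated code):

    #include <cassert>
    #include <complex>
    #include <vector>

    // Position of the (i,j) entry (0-based, j >= i) in the row-by-row packed upper triangle
    inline int packedIndex( int i, int j, int ncolor )
    {
      assert( j >= i );
      return i * ncolor - i * ( i - 1 ) / 2 + ( j - i );
    }

    // cfPacked holds CF(i,i) on the diagonal and 2*CF(i,j) off-diagonal, as in the new DATA statements
    double colorSumPacked( const std::vector<int>& cfPacked,
                           const std::vector<std::complex<double>>& jamp,
                           int ncolor,
                           int denom )
    {
      double result = 0;
      int cfIndex = 0; // same linear walk as CF_INDEX in matrix1.f
      for( int i = 0; i < ncolor; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = i; j < ncolor; j++ )
        {
          assert( cfIndex == packedIndex( i, j, ncolor ) ); // the walk matches the closed-form index
          ztemp += double( cfPacked[cfIndex++] ) * jamp[j];
        }
        result += ( ztemp * std::conj( jamp[i] ) ).real(); // the REAL*8 assignment in Fortran keeps the real part
      }
      return result / denom;
    }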
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading 
colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. 
Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
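Note (illustration, not part of the patch): the HASBLAS block above selects hasBlas only when the BLAS header matching the chosen GPU compiler is actually found, and otherwise falls back to hasNoBlas, which in turn adds -DMGONGPU_HAS_NO_BLAS to the compile flags. A minimal Python sketch of that decision follows; the installation prefixes used in the example are hypothetical and not taken from the build system.

import os

def default_hasblas(gpucc, cuda_home="", hip_home=""):
    # Illustrative restatement of the cudacpp.mk HASBLAS default (paths are assumptions)
    if not gpucc:  # CPU-only build
        return "hasNoBlas"
    if "nvcc" in gpucc:  # Nvidia GPU build: require the cuBLAS header
        header = os.path.join(cuda_home, "include", "cublas_v2.h")
    elif "hipcc" in gpucc:  # AMD GPU build: require the hipBLAS header
        header = os.path.join(hip_home, "include", "hipblas", "hipblas.h")
    else:
        return "hasNoBlas"
    return "hasBlas" if os.path.isfile(header) else "hasNoBlas"

if __name__ == "__main__":
    # Hypothetical CUDA installation prefix, for illustration only
    print(default_hasblas("nvcc", cuda_home="/usr/local/cuda"))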
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
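Note (illustration, not part of the patch): the overlay makefile above derives the common and backend library names from BACKEND and from processid_short, i.e. the last two underscore-separated tokens of the current P* directory name; the PROCESS and DSIG object lists continue right below. A minimal Python sketch of that naming convention, using a hypothetical subprocess directory name:

def cudacpp_libs(process_dir, backend):
    # Mirror of processid_short: keep the last two '_'-separated tokens of the P* directory name
    tokens = process_dir.rstrip("/").split("/")[-1].split("_")
    processid_short = "_".join(tokens[-2:])
    # cuda and hip keep their own suffix, every other backend maps to the cpp libraries
    suffix = backend if backend in ("cuda", "hip") else "cpp"
    return ("mg5amc_common_" + suffix, "mg5amc_" + processid_short + "_" + suffix)

if __name__ == "__main__":
    # Hypothetical subprocess directory, for illustration only
    print(cudacpp_libs("P1_gg_ttxttx", "cuda"))  # ('mg5amc_common_cuda', 'mg5amc_gg_ttxttx_cuda')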
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). 
+// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk 
b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk new file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
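For reference, the flavour_bias consistency rules added to check_validity above reduce to the following minimal standalone sketch (illustrative only; InvalidRunCard is replaced by ValueError and the run-card values are passed as plain arguments):

import logging

logger = logging.getLogger(__name__)

def check_flavour_bias(flavour_bias, event_norm):
    # flavour_bias = [abs(PDG) of the flavour to enhance, enhancement factor]
    if len(flavour_bias) != 2:
        raise ValueError("'flavour_bias' should contain exactly two numbers: "
                         "the abs(PDG) of the flavour to enhance and the "
                         "enhancement multiplication factor.")
    if any(i < 0 for i in flavour_bias):
        raise ValueError("flavour and multiplication factor should be positive "
                         "in the flavour_bias parameter")
    if flavour_bias[1] != 1 and event_norm != 'bias':
        logger.warning('Non-trivial flavour enhancement factor: setting event '
                       'normalisation to "bias"')
        event_norm = 'bias'
    return event_norm

# Example: check_flavour_bias([5, 100], 'average') warns and returns 'bias'.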
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, 
path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) 
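The per-channel splitting introduced just below in get_job_for_event is a ceiling division of the requested events by the per-job maximum, clamped between 1 and max_splitting; a condensed sketch of that arithmetic (the default values shown here are illustrative, with max_request_event taken from the new --maxevts option when running with several processes):

def nb_split_for(needed_event, max_request_event=2500, max_splitting=130,
                 split_channels=True):
    # Ceiling division of the requested events by the per-job maximum,
    # clamped to [1, max_splitting]; a single job if splitting is disabled.
    if not split_channels:
        return 1
    nb_split = (int(needed_event) - 1) // int(max_request_event) + 1
    return max(1, min(nb_split, max_splitting))

assert nb_split_for(2500) == 1
assert nb_split_for(2501) == 2
assert nb_split_for(10**9) == 130   # capped by max_splitting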
+ if 'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central 
merging_scale for the variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
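The gnuplot test changed above in histograms.py now compares the integer major version instead of applying float() to the whole version token; a minimal sketch of that parsing, assuming "gnuplot --version" prints something like "gnuplot 5.4 patchlevel 2":

import subprocess

def gnuplot_major_version(default=5):
    # Return the major version of the gnuplot found in PATH, falling back
    # to the v5 behaviour when gnuplot is missing or the output is unparsable.
    try:
        out = subprocess.run(['gnuplot', '--version'],
                             capture_output=True, text=True).stdout
        return int(out.split()[1].split('.')[0])
    except (OSError, IndexError, ValueError):
        return default

use_v4_templates = gnuplot_major_version() < 5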
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 
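The sign fix applied to FourMomentum.pseudorapidity a few hunks below follows the standard definition eta = 0.5*ln((|p|+pz)/(|p|-pz)), so a forward-going particle (pz > 0) gets a positive eta; a quick numerical check of that convention:

import math

def pseudorapidity(px, py, pz):
    # eta = 0.5 * ln((|p| + pz) / (|p| - pz))
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

assert pseudorapidity(1.0, 0.0, 10.0) > 0   # forward particle, positive eta
# The previous expression, log((norm - pz) / (norm + pz)), flips the sign.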
return self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz 
%(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
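When the old interface is not requested, do_pythia8 above looks for Pythia's main164 example under pythia8_path and falls back to the MG5aMC_PY8_interface route if it cannot be found; a condensed sketch of that lookup (the helper name is hypothetical, the probed paths mirror the ones above):

import os

def find_pythia8_main164(pythia8_path):
    # Return (path to main164, need_old_interface).
    for candidate in (os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
                      os.path.join(pythia8_path, 'examples', 'main164')):
        if os.path.exists(candidate):
            return candidate, False   # run main164 directly, passing the card with '-c'
    return None, True                 # caller retries via the --old_interface path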
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
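The remove_empty_events helper added above drops G directories whose events.lhe is essentially empty (fewer than 10 bytes) and classifies the reason by scanning the channel log; a much-reduced sketch of that classification (keywords follow the patch, the backwards log reading is simplified to a plain read):

import collections, os

def classify_empty_channels(gdirs):
    reasons = collections.defaultdict(list)
    kept = []
    for gdir in gdirs:
        lhe = os.path.join(gdir, 'events.lhe')
        if os.path.exists(lhe) and os.path.getsize(lhe) >= 10:
            kept.append(gdir)
            continue
        try:
            log = open(os.path.join(gdir, 'log.txt')).read()
        except OSError:
            log = ''
        if 'Impossible BW configuration' in log:
            reasons['bwconfig'].append(gdir)
        elif 'Loosen cuts or increase max_events' in log:
            reasons['cuts'].append(gdir)
        elif 'all returned zero' in log:
            reasons['zero'].append(gdir)
        else:
            reasons['unknown'].append(gdir)
    return kept, reasons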
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent b/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h index 98fc59d3ea..32bd465108 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc index e394058ac8..bbcb428317 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h index 6d053c0d16..93a221c714 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index 5444229389..f3ee9f80b4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,17 +46,16 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -73,7 +72,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.12831377983093262  +DEBUG: model prefixing takes 0.08983516693115234  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -88,33 +87,33 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.671 s +1 processes with 72 diagrams generated in 3.162 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the 
exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. 
-Generated helas calls for 1 subprocesses (72 diagrams) in 0.186 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. +Generated helas calls for 1 subprocesses (72 diagrams) in 0.227 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.316 s +ALOHA: aloha creates 5 routines in 0.248 s VVV5 VVV5 FFV1 @@ -124,17 +123,17 @@ ALOHA: aloha creates 5 routines in 0.316 s VVVV1 VVVV9 VVVV10 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m5.073s -user 0m4.975s -sys 0m0.073s -Code generation completed in 5 seconds +real 0m4.505s +user 0m4.403s +sys 0m0.078s +Code generation completed in 4 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/COPYRIGHT b/epochX/cudacpp/smeft_gg_tttt.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/COPYRIGHT +++ b/epochX/cudacpp/smeft_gg_tttt.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
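The gpu*/gpuBlas* macro layer in GpuAbstraction.h and the checkGpu/checkGpuBlas wrappers in GpuRuntime.h are the intended entry points for all device and BLAS calls, so that the same source compiles for both CUDA and HIP. A minimal usage sketch, assuming a CUDA or HIP build of the plugin; scaleKernel, scaleOnDevice and the buffer sizes are hypothetical names chosen only for illustration:

// Hypothetical sketch: doubling a device buffer through the gpu* abstraction layer.
#include "GpuAbstraction.h"
#include "GpuRuntime.h"

__global__ void scaleKernel( double* data )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  data[ievt] *= 2;
}

void scaleOnDevice( double* hstData, const int gpublocks, const int gputhreads )
{
  const int nevt = gpublocks * gputhreads;
  double* devData = nullptr;
  gpuMalloc( (void**)&devData, nevt * sizeof( double ) );                        // checkGpu-wrapped cudaMalloc/hipMalloc
  gpuMemcpy( devData, hstData, nevt * sizeof( double ), gpuMemcpyHostToDevice );
  gpuStream_t stream;
  gpuStreamCreate( &stream );                                                    // checkGpu-wrapped stream creation
  gpuLaunchKernelStream( scaleKernel, gpublocks, gputhreads, stream, devData );  // launch the kernel on the given stream
#ifndef MGONGPU_HAS_NO_BLAS
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // asserts via assertGpuBlas if status != GPUBLAS_STATUS_SUCCESS
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // attach the cuBLAS/hipBLAS handle to the same stream
  gpuBlasDestroy( handle );
#endif
  checkGpu( gpuDeviceSynchronize() );
  gpuMemcpy( hstData, devData, nevt * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuStreamDestroy( stream );
  gpuFree( devData );
}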
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
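The MatrixElementKernelDevice changes above introduce one GPU stream per good helicity plus a single cuBLAS/hipBLAS handle, with BLAS color sums opted into at run time via the CUDACPP_RUNTIME_BLASCOLORSUM environment variable. A condensed sketch of that lifecycle, assuming a BLAS-enabled CUDA build (MGONGPU_HAS_NO_BLAS undefined); setupHelicityStreams and teardownHelicityStreams are hypothetical free functions standing in for the ctor/dtor and computeGoodHelicities logic shown in the diff:

// Hypothetical condensed sketch of the per-helicity stream and BLAS handle lifecycle.
#include "GpuAbstraction.h"
#include "GpuRuntime.h"
#include <cstdlib>
#include <string>

void setupHelicityStreams( const int nGoodHel, gpuStream_t* helStreams, gpuBlasHandle_t& blasHandle, bool& blasColorSum )
{
  // Runtime opt-in to BLAS color sums (any non-empty value enables them)
  const char* blasEnv = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  blasColorSum = ( blasEnv != nullptr && std::string( blasEnv ) != "" );
  // One stream per good helicity, so that calculate_jamps kernels for different helicities can overlap
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    gpuStreamCreate( &helStreams[ighel] );
  if( blasColorSum )
  {
    checkGpuBlas( gpuBlasCreate( &blasHandle ) ); // a single handle shared by all good helicities
#if defined __CUDACC__ and defined MGONGPU_FPTYPE2_FLOAT
    // Optional TF32 tensor-core math (toggled by CUDACPP_RUNTIME_CUBLASTF32TENSOR in the real code)
    if( std::getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ) != nullptr )
      checkGpuBlas( cublasSetMathMode( blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) );
#endif
  }
}

void teardownHelicityStreams( const int nGoodHel, gpuStream_t* helStreams, gpuBlasHandle_t blasHandle, bool blasColorSum )
{
  if( blasColorSum ) gpuBlasDestroy( blasHandle );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    if( helStreams[ighel] ) gpuStreamDestroy( helStreams[ighel] ); // guard against streams that were never created
}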
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h index 2f711d8cc1..24800c08c9 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return 
NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA 
device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc index 6a64c39915..0355ad5663 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_SMEFTsim_topU3l_MwScheme_UFO_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const 
unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#endif + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // 
non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
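The DeviceAccessJamp2 helper introduced above addresses the jamp2 buffer in color-major [ncolor][nevt] order, so that consecutive threads of a warp touch consecutive events of the same color (coalesced access). A minimal standalone sketch of the same indexing pattern; sumOverColors and its arguments are hypothetical, and double precision is assumed:

// Hypothetical kernel illustrating the [ncolor][nevt] layout used by DeviceAccessJamp2:
// element (icol,ievt) lives at buffer[icol * nevt + ievt], so threads with consecutive ievt
// read contiguous memory for each color.
__global__ void sumOverColors( const double* jamp2s, // input: jamp2s[ncolor][nevt]
                               double* total,        // output: total[nevt], sum over colors
                               const int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  double sum = 0;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += jamp2s[icol * nevt + ievt]; // same indexing as DeviceAccessJamp2::kernelAccessIcolConst
  total[ievt] = sum;
}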
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -1342,164 +1398,43 @@ namespace mg5amcCpu jamp_sv[8] -= 1. / 2. * amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxttx()?) 
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, - { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, - { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, - { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, - { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, - { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, - { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
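The block removed here is the per-thread color sum that used to live inside calculate_wavefunctions: the constexpr denom/cf arrays and the quadratic form they feed are now evaluated in a dedicated color-sum step (the color_sum.h include added above and the color_sum_gpu call later in sigmaKin), optionally through cuBLAS/hipBLAS. For reference, the quantity computed per event and per helicity from the partial amplitudes J_i (jamp_sv) is

\[ |M|^2 \;=\; \sum_{i=1}^{n_{\rm color}} \frac{1}{{\rm denom}_i}\,{\rm Re}\Big( J_i^{*} \sum_{j=1}^{n_{\rm color}} cf_{ij}\, J_j \Big) \;=\; \sum_{i,j=1}^{n_{\rm color}} \frac{cf_{ij}}{{\rm denom}_i}\,\big( {\rm Re}\,J_i\,{\rm Re}\,J_j + {\rm Im}\,J_i\,{\rm Im}\,J_j \big), \]

where the second form uses the fact that cf is real (issue #475); its symmetry is what the triangular, pre-normalized constexpr variant in the removed C++ branch exploited. With the jamps of one helicity stored as an ncolor-by-nevt matrix, the sum over j is a matrix-matrix product and the contraction over i a per-event reduction, which is the shape of work the gpuBlasTgemm/gpuBlasTdot wrappers added in GpuAbstraction.h are meant to cover (the exact kernel choice is made inside the color_sum implementation, which is not part of this hunk).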
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -1587,7 +1522,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1622,6 +1561,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1664,6 +1607,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1784,8 +1731,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1793,25 +1740,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to 
store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -1956,13 +2081,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1024 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1974,18 +2093,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -2010,93 +2134,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2138,7 +2199,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -2161,7 +2222,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -2170,21 +2231,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -2198,8 +2261,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -2215,11 +2280,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -2321,14 +2387,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h index d207c3303f..c1de405ab1 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
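Both selection kernels in the hunks above (add_and_select_hel for helicities, #403, and select_col for colors, #402) implement the same sampling step: compare one uniform random number against a running sum of non-negative weights, normalised by the total, and return the first index that passes as a Fortran-style 1-based value. A minimal standalone sketch of that step, with a hypothetical helper name and std::vector weights standing in for the device buffers, is:

#include <vector>

// Cumulative-sum sampling as used for event-by-event helicity (#403) and color (#402)
// selection; 'weights' are e.g. the per-good-helicity MEs or the per-color jamp2 sums,
// 'rnd' is uniform in [0,1). Returns a 1-based index, or 0 if the total weight vanishes
// (the color kernel returns 0 in the same way when channelId == 0, see #783/#931).
inline int pickIndexFromRunningSum( const std::vector<double>& weights, const double rnd ) // hypothetical helper
{
  double total = 0;
  for( double w : weights ) total += w;
  if( total <= 0 ) return 0;
  double runningSum = 0;
  const int n = (int)weights.size();
  for( int i = 0; i < n; i++ )
  {
    runningSum += weights[i];
    if( rnd < runningSum / total ) return i + 1; // NB Fortran [1,N], cudacpp [0,N-1]
  }
  return n; // guard against rounding when rnd is very close to 1
}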
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 72; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] 
- int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc new file mode 100644 index 0000000000..767405ac3b --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc @@ -0,0 +1,437 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
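The color_sum.cc file introduced above evaluates, for one helicity, the color-summed contribution deltaME = sum_ij Re( conj(jamp_i) * CF_ij/denom_i * jamp_j ); since the color matrix is real, this splits into separate real and imaginary quadratic forms, and on the CPU path the symmetric matrix is folded into a triangle with doubled off-diagonal entries and the 1/denom normalisation baked in at compile time (#475). A toy 2-color sketch of that reduction (illustrative values only, not the generated 12x12 matrix) is:

#include <complex>

// Toy illustration of the triangular color sum: for a real symmetric CF,
// (A - iB) CF (A + iB) = A CF A + B CF B with A = Re(jamp), B = Im(jamp),
// and only the upper triangle is needed once off-diagonal terms are doubled.
inline double colorSumToy( const std::complex<double> jamp[2] )
{
  constexpr double denom[2] = { 3, 3 };                   // toy denominators
  constexpr double cf[2][2] = { { 48, 16 }, { 16, 48 } }; // toy symmetric color matrix
  double deltaME = 0;
  for( int icol = 0; icol < 2; icol++ )
  {
    double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real();
    double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < 2; jcol++ )
    {
      ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real(); // doubled off-diagonal term
      ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
    }
    deltaME += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI;
  }
  return deltaME;
}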
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, + { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, + { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, + { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, + { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, + { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, + { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: 
number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/cudacpp_overlay.mk 
b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fbridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/makefile_original.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
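The color_sum_blas routine above maps the same quadratic form onto two BLAS passes per real/imaginary plane: one gemm computing Ztemp = NormalizedColorMatrix x Jamps over all nhel*nevt columns at once, followed by a strided-batched gemm of 1x1 results that accumulates the per-column dot product Jamps_e . Ztemp_e into the per-helicity MEs (beta = 1). A plain C++ reference of what those two steps compute for one plane, with hypothetical buffer names and a column-per-(ihel,ievt) layout, is:

// CPU reference (not the patch itself) for one real or imaginary plane:
//   step 1: ztemp[icol][e] = sum_j normColMat[icol][j] * jamps[j][e]   (the gemm)
//   step 2: mes[e]        += sum_i jamps[i][e] * ztemp[i][e]           (the batched 1x1 gemm, beta=1)
// where e = ihel * nevt + ievt runs over all nColEvents = nGoodHel * nevt columns.
inline void colorSumBlasReference( double* mes,              // [nColEvents], accumulated in place
                                   const double* jamps,      // [ncolor * nColEvents], jamps[icol * nColEvents + e]
                                   const double* normColMat, // [ncolor * ncolor], CF[i][j] / denom[i]
                                   double* ztemp,            // [ncolor * nColEvents] scratch
                                   const int ncolor,
                                   const int nColEvents )
{
  for( int e = 0; e < nColEvents; e++ )
  {
    for( int i = 0; i < ncolor; i++ )
    {
      double sum = 0;
      for( int j = 0; j < ncolor; j++ ) sum += normColMat[i * ncolor + j] * jamps[j * nColEvents + e];
      ztemp[i * nColEvents + e] = sum;
    }
    double dot = 0;
    for( int i = 0; i < ncolor; i++ ) dot += jamps[i * nColEvents + e] * ztemp[i * nColEvents + e];
    mes[e] += dot;
  }
}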
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef 
MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
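The DeviceAccessJamp accessor declared in color_sum.h above keeps the jamps for all good helicities in two contiguous [ncolor][nhel][nevt] planes (real first, then imaginary) with ievt fastest, so that both the CUDA/HIP kernels and cuBLAS can stride through the same super-buffer. A tiny host-side helper (hypothetical, not in the patch) spelling out that flat index is:

// Flat offset into the ghelAllJamps super-buffer for element (reim, icol, ihel, ievt),
// matching the expression used in kernelAccessIcolIhelNhel(Const): reim = 0 selects
// the real plane, reim = 1 the imaginary plane.
inline int jampFlatIndex( const int reim, const int icol, const int ihel, const int ievt,
                          const int ncolor, const int nhel, const int nevt )
{
  return reim * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt;
}
// Example: the real part of jamp(icol,ihel) for event ievt lives at
// buffer[ jampFlatIndex( 0, icol, ihel, ievt, ncolor, nhel, nevt ) ].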
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
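The HASBLAS block above only decides two things: whether -DMGONGPU_HAS_NO_BLAS is added to CXXFLAGS/GPUFLAGS (BLASCXXFLAGS) and whether -lcublas or -lhipblas is added at link time (BLASLIBFLAGS); the sources then compile out the BLAS path accordingly, and color_sum_gpu additionally falls back to the kernel path at run time when the BLAS handle is null. A compile-time sketch of that guard, with a hypothetical helper, is:

#include <cstdio>

// Hypothetical helper (not in the patch) showing the effect of HASBLAS=hasNoBlas:
// with -DMGONGPU_HAS_NO_BLAS the BLAS color sum is compiled out entirely, so only
// the per-helicity kernel path remains available.
inline bool blasColorSumAvailable()
{
#ifdef MGONGPU_HAS_NO_BLAS
  return false; // hasNoBlas build: no cuBLAS/hipBLAS code or link dependency
#else
  return true; // hasBlas build: cuBLAS/hipBLAS is linked via BLASLIBFLAGS
#endif
}

int main()
{
  std::printf( "BLAS color sum available: %s\n", blasColorSumAvailable() ? "yes" : "no" );
  return 0;
}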
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
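# Illustration (not part of the patch): for a hypothetical subprocess directory
# containing driver.f, auto_dsig.f and auto_dsig1.f, the DSIG pattern rules below
# would expand roughly to
#   DSIG         = driver.o auto_dsig1.o
#   DSIG_cudacpp = driver_cudacpp.o auto_dsig1_cudacpp.o
# i.e. auto_dsig.f itself is filtered out and each remaining auto_dsig*.f is compiled
# twice, once for plain Fortran and once with -DMG5AMC_MEEXPORTER_CUDACPP (see the
# %_cudacpp.o rule further down).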
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
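// Usage sketch (illustrative only, not part of this header): a C++ test harness
// could drive the Fortran-facing API declared below roughly as follows, assuming
// caller-allocated arrays and hypothetical values nevt=32, npar=4, np4=4:
//   CppObjectInFortran* bridge = nullptr;
//   int nevt = 32, npar = 4, np4 = 4;
//   fbridgecreate_( &bridge, &nevt, &npar, &np4 );
//   bool goodHelOnly = false;
//   fbridgesequence_( &bridge, momenta, gs, rndhel, rndcol, channelIds,
//                     mes, selhel, selcol, &goodHelOnly );
//   fbridgedelete_( &bridge );
// In production these entry points are called from Fortran (hence the trailing
// underscores and the pointer-to-scalar arguments); all FORTRANFPTYPE arrays are
// double precision as typedef'd below.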
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h index 98fc59d3ea..32bd465108 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc index e394058ac8..bbcb428317 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h index 6d053c0d16..93a221c714 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
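# Note (illustrative): on non-Darwin hosts the googletest build configured below adds
# -mavx2 -mfma through GTEST_CMAKE_FLAGS, so the configure step effectively becomes
#   cmake -DCMAKE_INSTALL_PREFIX:PATH=<THISDIR>/googletest/install \
#         -DCMAKE_CXX_FLAGS="-mavx2 -mfma" -DBUILD_GMOCK=OFF ../
# while on Darwin no extra flags are passed; googletest is now cloned at tag v1.17.0
# instead of release-1.11.0.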
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 1690ef1273..856e106f98 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,21 +549,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.124 s +1 processes with 6 diagrams generated in 0.091 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -576,57 +575,51 @@ FileWriter t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (6 diagrams) in 0.009 s -Wrote files for 16 helas calls in 0.082 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s +Wrote files for 16 helas calls in 0.096 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.186 s +ALOHA: aloha creates 3 routines in 0.146 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha 
creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.184 s +ALOHA: aloha creates 6 routines in 0.144 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses/P1_gg_t1t1x; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -Hunk #2 succeeded at 215 (offset -12 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. quit -real 0m2.996s -user 0m2.690s -sys 0m0.299s +real 0m3.181s +user 0m2.732s +sys 0m0.440s Code generation completed in 3 seconds ************************************************************ * * @@ -640,7 +633,7 @@ Code generation completed in 3 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -648,10 +641,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -670,7 +662,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -678,10 +670,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/COPYRIGHT b/epochX/cudacpp/susy_gg_t1t1.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/COPYRIGHT +++ b/epochX/cudacpp/susy_gg_t1t1.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat index 9025117612..a5aa626839 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt b/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts b/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f b/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc b/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile b/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc b/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. 
Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. 
In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? 
- */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? + */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
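  // Descriptive note on the param_card lookup that follows: the loop first tries
  // "../Cards/param_card.dat" relative to the current working directory and then
  // "../../Cards/param_card.dat"; if neither exists, the final candidate
  // "../../../Cards/param_card.dat" is passed to initProc without a further check.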
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
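
To make the index arithmetic of the transposition kernels above easier to follow, here is a minimal host-side C++ sketch of the same Fortran AOS to C++ AOSOA mapping; the helper name transposeF2C and the std::vector interface are illustrative and not part of the patch.

#include <cassert>
#include <vector>

// Rearrange Fortran-style AOS momenta[ievt][ipar][ip4] into C++-style AOSOA
// momenta[ipagM][ipar][ip4][ieppM] pages of neppM events each (F2C direction).
std::vector<double> transposeF2C( const std::vector<double>& in, int nevt, int npar, int np4, int neppM )
{
  assert( nevt % neppM == 0 );                       // nevt must be a multiple of the SIMD page size
  assert( in.size() == (size_t)nevt * npar * np4 );  // one 4-momentum component per (event, particle, index)
  std::vector<double> out( in.size() );
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int ipagM = ievt / neppM;
        const int ieppM = ievt % neppM;
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4;                                  // AOS (Fortran)
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; // AOSOA (C++)
        out[cpos] = in[fpos];
      }
  return out;
}
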
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
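
As an illustrative usage sketch (not part of the patch), the portability macros from GpuAbstraction.h and the checkGpuBlas helper added above can be combined as below. This assumes a GPU build with BLAS enabled (MGONGPUCPP_GPUIMPL defined and MGONGPU_HAS_NO_BLAS undefined); the function axpyOnDevice is hypothetical.

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

// Compute y = alpha*x + y on the device via the gpuBlas* portability macros
// (cublasDaxpy on CUDA, hipblasDaxpy on HIP), checking every status code.
void axpyOnDevice( int n, double alpha, const double* hx, double* hy )
{
  double *dx = nullptr, *dy = nullptr;
  gpuMalloc( &dx, n * sizeof( double ) );
  gpuMalloc( &dy, n * sizeof( double ) );
  gpuMemcpy( dx, hx, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuMemcpy( dy, hy, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasDaxpy( handle, n, &alpha, dx, 1, dy, 1 ) );
  gpuMemcpy( hy, dy, n * sizeof( double ), gpuMemcpyDeviceToHost );
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuFree( dx );
  gpuFree( dy );
}
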
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
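
For context on the CUDACPP_RUNTIME_BLASCOLORSUM check in the MatrixElementKernelDevice constructor above, this is a minimal sketch of the same environment-variable toggle pattern; the helper envFlagIsSet is illustrative.

#include <cstdlib>
#include <string>

// The feature is enabled only if the variable is set and non-empty
// (no "Y"/"N" decoding yet, as noted in the patch comments).
inline bool envFlagIsSet( const char* name )
{
  const char* value = std::getenv( name );
  return value && std::string( value ) != "";
}
// e.g. const bool useBlasColorSum = envFlagIsSet( "CUDACPP_RUNTIME_BLASCOLORSUM" );
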
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h index 5bd3053393..c5e79dc1b1 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; 
#endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color 
selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc index 1b3601c86b..aa42f4a070 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE 
void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
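
For readers following the new DeviceAccessJamp2 accessor above, here is a minimal host-side sketch of the color-major buffer layout it assumes, i.e. buffer[icol * nevt + ievt], which keeps the per-event accesses of one color contiguous (and hence coalesced across GPU threads); the helper accessIcol is illustrative.

#include <cassert>
#include <vector>

// Same indexing as DeviceAccessJamp2::kernelAccessIcol, but on a host buffer.
inline double& accessIcol( std::vector<double>& buffer, int icol, int ievt, int nevt )
{
  assert( ievt < nevt );
  return buffer[icol * nevt + ievt];
}
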
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -403,154 +459,43 @@ namespace mg5amcCpu jamp_sv[1] += amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_t1t1x()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
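What is being removed here is the color sum itself: each helicity adds sum_ij [ Re(J_i) M_ij Re(J_j) + Im(J_i) M_ij Im(J_j) ] to |M|^2, with M_ij = cf[i][j] / denom[i], and on the C++ side the off-diagonal factor 2 and the 1/denom normalization are folded into a constexpr triangular matrix at compile time (issue #475). The standalone sketch below uses the 2x2 color matrix and denominators quoted in this hunk and checks that the triangular form reproduces the plain symmetric sum; the function names are illustrative, not the plugin's API.

#include <cassert>
#include <cmath>
#include <complex>

constexpr int ncolor = 2;
constexpr double denom[ncolor] = { 3, 3 };
constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };

// Full symmetric quadratic form (the branch kept for CUDA in the removed code)
double colorSumFull( const std::complex<double> jamp[ncolor] )
{
  double me = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i][j] * jamp[j].real();
      ztempI += cf[i][j] * jamp[j].imag();
    }
    me += ( ztempR * jamp[i].real() + ztempI * jamp[i].imag() ) / denom[i];
  }
  return me;
}

// Triangular form with the factor 2 and 1/denom folded in up front (the #475 trick);
// it relies on cf[i][j]/denom[i] being symmetric, as noted in the removed comments.
double colorSumTriangular( const std::complex<double> jamp[ncolor] )
{
  double me = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  return me;
}

int main()
{
  const std::complex<double> jamp[ncolor] = { { 0.3, -1.1 }, { 2.0, 0.7 } };
  assert( std::abs( colorSumFull( jamp ) - colorSumTriangular( jamp ) ) < 1e-12 );
  return 0;
}

The same normalization is what createNormalizedColorMatrix precomputes into device memory in the new color_sum.cc below, where color_sum_kernel applies the explicit factor 2 in its lower-triangular loop instead.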
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -578,7 +523,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -611,6 +560,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_Msu3 ); m_masses.push_back( m_pars->mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_Msu3, (fptype)m_pars->mdl_Wsu3, (fptype)m_pars->mdl_Msu6, (fptype)m_pars->mdl_Wsu6 }; @@ -651,6 +604,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -771,8 +728,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -780,25 +737,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store 
the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -943,13 +1078,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -961,18 +1090,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -997,93 +1131,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1125,7 +1196,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1148,7 +1219,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1157,21 +1228,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1185,8 +1258,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1202,11 +1277,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1308,14 +1384,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h index d48c729c48..f01e3c5efd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 4; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 6; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f index 28f44ab169..6b4d390131 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f index 40fbb596f2..05a7d543d8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 
0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one
specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities 
for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! 
From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need 
one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/configs.inc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/configs.inc index cbcfeb2c9a..5e64cc3afe 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/configs.inc +++ 
b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/configs.inc @@ -42,3 +42,5 @@ C Diagram 5 DATA (SPROP(I,-2,5),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/5/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f index 3fc552a31d..5f9d807b6d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fbridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/makefile_original.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f index 1a1830b77a..bbf79fd11b 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -215,17 +212,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -295,7 +281,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -338,7 +324,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -383,23 +370,31 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WSU3.NE.0D0) FK_MDL_WSU3 = SIGN(MAX(ABS(MDL_WSU3), - $ ABS(MDL_MSU3*SMALL_WIDTH_TREATMENT)), MDL_WSU3) - IF(MDL_WSU6.NE.0D0) FK_MDL_WSU6 = SIGN(MAX(ABS(MDL_WSU6), - $ ABS(MDL_MSU6*SMALL_WIDTH_TREATMENT)), MDL_WSU6) + FK_ZERO = 0D0 + IF(MDL_WSU3.NE.0D0) THEN + FK_MDL_WSU3 = SIGN(MAX(ABS(MDL_WSU3), ABS(MDL_MSU3 + $ *SMALL_WIDTH_TREATMENT)), MDL_WSU3) + ELSE + FK_MDL_WSU3 = 0D0 + ENDIF + + IF(MDL_WSU6.NE.0D0) THEN + FK_MDL_WSU6 = SIGN(MAX(ABS(MDL_WSU6), ABS(MDL_MSU6 + $ *SMALL_WIDTH_TREATMENT)), MDL_WSU6) + ELSE + FK_MDL_WSU6 = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
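Note (editorial, not part of the patch): the matrix1.f hunk above and the one below replace the dense REAL*8 color matrix CF(NCOLOR,NCOLOR) by a packed integer upper triangle CF(NCOLOR*(NCOLOR+1)/2) plus a single DENOM, consumed by the new CF_INDEX loop over J >= I. The following is a minimal standalone sketch, in C++ rather than the generated Fortran, of why the two storages give the same result. It assumes, consistent with the DATA values shown (-4 where the dense matrix had -6.666...D-01 = -2/3, and 16 where it had 16/3), that packed entries are pre-multiplied by DENOM and that off-diagonal entries are additionally doubled to absorb their symmetric partners; the jamp values are arbitrary illustrative inputs.

#include <array>
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  using cxd = std::complex<double>;
  constexpr int ncolor = 2; // as in this P1_gg_t1t1x subprocess
  // Dense color matrix from the removed DATA statements: {16/3, -2/3; -2/3, 16/3}
  const double cfDense[ncolor][ncolor] = { { 16. / 3., -2. / 3. }, { -2. / 3., 16. / 3. } };
  // Packed storage from the new DATA statements: upper triangle, scaled by DENOM,
  // with off-diagonal entries additionally doubled (assumption inferred from the values)
  const std::array<int, ncolor*( ncolor + 1 ) / 2> cfPacked = { 16, -4, 16 };
  const int denom = 3;
  // Arbitrary color amplitudes ("jamps") for one event and helicity
  const std::array<cxd, ncolor> jamp = { cxd( 0.3, -1.1 ), cxd( -0.7, 0.4 ) };
  // Dense sum: Re( sum_ij conj(J_i) * CF(i,j) * J_j )
  double meDense = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meDense += std::real( std::conj( jamp[i] ) * cfDense[i][j] * jamp[j] );
  // Packed sum: the CF_INDEX loop runs only over J >= I, then divides once by DENOM
  double mePacked = 0;
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    cxd ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cfPacked[cfIndex++] ) * jamp[j];
    mePacked += std::real( ztemp * std::conj( jamp[i] ) ); // real part, matching the REAL*8 accumulator
  }
  mePacked /= denom;
  assert( std::fabs( meDense - mePacked ) < 1e-12 );
  return 0;
}

Under these assumptions the J >= I loop visits each off-diagonal pair only once, and the single division by DENOM replaces the per-element rational coefficients of the old REAL*8 matrix.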
@@ -451,10 +446,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -463,6 +460,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(2)=AMP2(2)+AMP(3)*DCONJG(AMP(3)) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! 
do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. 
@@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors 
+ const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. 
Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk 
b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk new file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
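
The flavour_bias option introduced in the run_card hunk above is documented only by its inline comment, so here is a minimal standalone Python sketch of the underlying importance-sampling idea (illustrative only: the draw_channel helper and the dict layout are assumptions, not MadGraph code). A bias of [5, 100] makes channels containing a b quark 100 times more likely to be sampled while dividing their event weight by 100, so weighted sums, and hence the cross section, are unchanged; this is also why a non-trivial factor forces event_norm to 'bias'.

    import random

    def draw_channel(channels, flavour_bias=(5, 100.0)):
        """channels: {abs_pdg: cross_section}. Returns (abs_pdg, event_weight)."""
        pdg_bias, factor = flavour_bias
        boost = lambda pdg: factor if pdg == pdg_bias else 1.0
        # enhance the sampling probability of the biased flavour ...
        biased = {pdg: xs * boost(pdg) for pdg, xs in channels.items()}
        total = sum(biased.values())
        r, acc = random.uniform(0.0, total), 0.0
        for pdg, s in biased.items():
            acc += s
            if r <= acc:
                break
        # ... and compensate in the weight: on average the returned weight equals
        # sum(channels.values()) whatever 'factor' is, while each event of the
        # biased flavour individually carries a weight reduced by 'factor'.
        return pdg, total / boost(pdg)
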
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, 
error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central 
merging_scale for the variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
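
A few lines above, the histograms.py hunk also stores the previously unassigned result of output.decode(...) and compares the gnuplot version via its integer major number instead of float() of the whole token. A minimal sketch of that kind of check, assuming gnuplot --version prints something like "gnuplot 5.4 patchlevel 2" (the helper name and the use of subprocess.run are illustrative, not the histograms.py code):

    import subprocess

    def gnuplot_major_version(cmd="gnuplot"):
        """Return gnuplot's major version as an int, or None if unavailable."""
        try:
            out = subprocess.run([cmd, "--version"], capture_output=True, check=True).stdout
        except (OSError, subprocess.CalledProcessError):
            return None
        text = out.decode(errors="ignore")  # bytes -> str; the result must be assigned
        # "gnuplot 5.4 patchlevel 2" -> token "5.4" -> major "5"; keeping only the
        # integer major part tolerates version tokens that float() would reject.
        try:
            return int(text.split()[1].split(".")[0])
        except (IndexError, ValueError):
            return None

Used as gnuplot_major_version() < 5, this selects the v4 output templates, matching the intent of the patched comparison.
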
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 
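The define_init_banner hunks above normalize the group key with int() on every access, so that string and integer labels land in the same bucket, and they accumulate error**2, i.e. the per-group uncertainties are presumably combined in quadrature. A self-contained sketch of that accumulation pattern (sample numbers are illustrative; the original uses explicit if/else on plain dicts rather than defaultdict):

import math
from collections import defaultdict

grouped_cross = defaultdict(float)
grouped_error2 = defaultdict(float)

# group labels may arrive as str or int; int() makes '1' and 1 share a bucket
for group, cross, error in [('1', 2.0, 0.1), (1, 3.0, 0.2)]:
    grouped_cross[int(group)] += cross
    grouped_error2[int(group)] += error**2   # squared errors add

print(grouped_cross[1], math.sqrt(grouped_error2[1]))  # 5.0 0.2236...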
return self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz 
%(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
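The pseudorapidity fix in the lhe_parser.py hunk above flips the ratio inside the logarithm so that it matches the standard definition eta = 0.5*ln((|p|+pz)/(|p|-pz)) = -ln(tan(theta/2)); the previous ordering returned -eta. A standalone check (the momentum components are illustrative):

import math

def pseudorapidity(px, py, pz):
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

# a forward-moving particle (pz > 0) must have positive eta
print(pseudorapidity(1.0, 0.0, 2.0))   # ~ +1.44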
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
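Unless --old_interface is requested, do_pythia8 above now probes two candidate locations for Pythia8's main164 example driver and, if neither exists, warns and re-runs with the old MG5aMC_PY8_interface. A sketch of that lookup, with the retry left to the caller (the candidate paths follow the diff; the function name is hypothetical):

import os

def find_pythia_main164(pythia8_path):
    candidates = [
        os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
        os.path.join(pythia8_path, 'examples', 'main164'),
    ]
    for path in candidates:
        if os.path.exists(path):
            return path
    return None   # caller logs a warning and retries with ' --old_interface'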
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
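The run_PY8.sh wrapper templates above are now formatted in two passes: the first pass only injects the optional '-c' flag (empty for the old interface), so the placeholders needed later for the shell and the executable name are escaped as %%s and survive until the second pass. A minimal reproduction of that escaping (the shell and executable values are illustrative):

template = """#!%%s
./%%s %s PY8Card.dat >& PY8_log.txt
""" % '-c'                                      # pass 1: flag only
script = template % ('/bin/bash', 'main164')    # pass 2: shell and executable
print(script)
# #!/bin/bash
# ./main164 -c PY8Card.dat >& PY8_log.txt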
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
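The new remove_empty_events helper above keeps only the channel directories whose events.lhe actually contains data (anything missing or under 10 bytes is treated as empty) before classifying the empty ones by scanning their logs. The core filtering step, stripped of the log forensics, looks roughly like this (the directory list and the helper name are hypothetical):

import os

def keep_nonempty(gdirs, min_bytes=10):
    kept = []
    for gdir in gdirs:
        try:
            size = os.path.getsize(os.path.join(gdir, 'events.lhe'))
        except OSError:
            size = 0                  # missing file counts as empty
        if size >= min_bytes:
            kept.append(gdir)
    return kept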
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir))
    fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent b/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h index ec627d7759..85c140d111 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc index d596fdf1ec..eafa38c4dd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h index 26a532156c..a9dc1dce79 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 45c009959b..0ef608d7aa 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,47 +549,47 @@ INFO: Please specify coupling orders to bypass this step. 
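The restore_data and cudacpp_test.mk changes above both branch on the host platform: macOS takes its core count from sysctl -n hw.ncpu and skips the -mavx2 -mfma flags, while Linux uses nproc --all and keeps them. The same probing expressed in Python, for illustration only (function names are made up; the flag choice mirrors the Makefile):

import os
import platform

def host_cores():
    # plays the role of `nproc --all` (Linux) / `sysctl -n hw.ncpu` (macOS)
    return os.cpu_count() or 1

def gtest_cxx_flags():
    # the Makefile only adds the x86 SIMD flags on non-Darwin hosts
    return '' if platform.system() == 'Darwin' else '-mavx2 -mfma'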
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.122 s +1 processes with 6 diagrams generated in 0.100 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.182 s +ALOHA: aloha creates 3 routines in 0.151 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. quit -real 0m1.324s -user 0m1.250s -sys 0m0.065s -Code generation completed in 1 seconds +real 0m1.343s +user 0m1.251s +sys 0m0.081s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/COPYRIGHT b/epochX/cudacpp/susy_gg_t1t1.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/COPYRIGHT +++ b/epochX/cudacpp/susy_gg_t1t1.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T

//--------------------------------------------------------------------------

#elif defined __HIPCC__

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
#define gpuError_t hipError_t
#define gpuPeekAtLastError hipPeekAtLastError
#define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
#define gpuSetDevice hipSetDevice
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuDeviceReset hipDeviceReset
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ...
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
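As a quick illustration of the checkGpuBlas/assertGpuBlas error-checking pattern added in GpuRuntime.h above, here is a self-contained usage sketch written directly against cuBLAS for clarity; the plugin itself goes through the gpuBlas* abstraction macros, and the checkBlas/assertBlas names below are hypothetical stand-ins for the plugin's checkGpuBlas/assertGpuBlas.

#include <cublas_v2.h>
#include <cassert>
#include <cstdio>

// Hypothetical stand-in for checkGpuBlas/assertGpuBlas: print file and line, then abort on failure
#define checkBlas( code ) { assertBlas( code, __FILE__, __LINE__ ); }
inline void assertBlas( cublasStatus_t code, const char* file, int line, bool abort = true )
{
  if( code != CUBLAS_STATUS_SUCCESS )
  {
    printf( "ERROR! assertBlas: '%d' in %s:%d\n", code, file, line );
    if( abort ) assert( code == CUBLAS_STATUS_SUCCESS );
  }
}

int main()
{
  cublasHandle_t handle;
  checkBlas( cublasCreate( &handle ) );  // fail fast with file/line information if cuBLAS initialization fails
  checkBlas( cublasDestroy( handle ) );
  return 0;
}

The design choice mirrors the existing checkGpu wrapper: wrapping every BLAS call keeps error reporting uniform across the CUDA, HIP and noBLAS build variants.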
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h index 5bd3053393..c5e79dc1b1 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif 
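The m_helStreams[CPPProcess::ncomb] array added to MatrixElementKernelDevice above is only populated for the nGoodHel good helicities found at runtime, and the destructor only destroys streams that were actually created. A minimal CUDA sketch of that bookkeeping, using the plain CUDA runtime API rather than the gpuStream_t macros; the ncomb value and the helper names are illustrative only.

#include <cuda_runtime.h>
#include <cassert>

constexpr int ncomb = 16;            // illustrative: total number of helicity combinations
cudaStream_t helStreams[ncomb] = {}; // zero-initialized, so unused entries stay nullptr

void createHelStreams( int nGoodHel ) // called once the good helicities are known
{
  assert( nGoodHel > 0 && nGoodHel <= ncomb );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    cudaStreamCreate( &helStreams[ighel] ); // one stream per good helicity
}

void destroyHelStreams() // mirrors the MatrixElementKernelDevice destructor logic in this diff
{
  for( int ihel = 0; ihel < ncomb; ihel++ )
    if( helStreams[ihel] ) cudaStreamDestroy( helStreams[ihel] ); // skip streams that were never created
}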
@@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + 
typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc index 1d53b4a535..e8819f6df2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE 
void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
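The DeviceAccessJamp2 accessor introduced above stores the per-color |jamp|^2 running sums in a color-major [ncolor][nevt] layout, and the per-helicity kernels launched in separate streams accumulate into it with atomicAdd (as shown further below in this file's diff). A minimal CUDA sketch of that layout and accumulation, with one thread per event; jamp2Icol and addJamp2ForOneHelicity are illustrative names, and jampAbs2 stands for a precomputed |jamp|^2 buffer for one helicity.

__device__ inline double& jamp2Icol( double* buffer, int icol )
{
  const int nevt = gridDim.x * blockDim.x;               // one thread per event
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  return buffer[icol * nevt + ievt];                     // color-major: each color owns a contiguous slice of nevt values
}

__global__ void addJamp2ForOneHelicity( double* colAllJamp2s, const double* jampAbs2, int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    atomicAdd( &jamp2Icol( colAllJamp2s, icol ), jampAbs2[icol * nevt + ievt] ); // safe when helicities run in concurrent streams
  // NB: atomicAdd on double requires compute capability >= 6.0
}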
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -404,154 +460,43 @@ namespace mg5amcCpu jamp_sv[1] += amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_t1t1x()?) 
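
The atomicAdd in the GPU branch above is what keeps the per-colour jamp2 accumulation correct once each good helicity runs in its own stream: several instances of the kernel may add into the same colAllJamp2s slot concurrently. A standalone CUDA sketch of that pattern follows; the names are hypothetical (the plugin uses its DeviceAccessJamp2 accessor instead) and atomicAdd on double assumes compute capability 6.0 or later.

__global__ void accumulateJamp2Sketch( double* jamp2,          // shared buffer [ncolor * nevt], summed over helicities
                                        const double* absJamp2, // |jamp|^2 for this helicity [ncolor * nevt]
                                        int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
    atomicAdd( &jamp2[icol * nevt + ievt], absJamp2[icol * nevt + ievt] ); // race-free "+=" across streams
}
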
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
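
For reference, the rewrite invoked in the comment above (and repeated in the new color_sum.cc further below) can be spelled out once. Write the colour amplitudes as J = A + iB with A and B real, and let N[i][j] = cf[i][j] / denom[i], which is symmetric here because the two denominators are equal:

  (A - iB)^T N (A + iB) = A^T N A + B^T N B + i ( A^T N B - B^T N A )
                        = A^T N A + B^T N B      [the imaginary part cancels since N = N^T gives A^T N B = B^T N A]

so only real products of the jamp real and imaginary parts are needed; folding the symmetric N into a triangular matrix with doubled off-diagonal entries is then just a regrouping of the same sum.
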
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -579,7 +524,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -612,6 +561,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_Msu3 ); m_masses.push_back( m_pars->mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_Msu3, (fptype)m_pars->mdl_Wsu3, (fptype)m_pars->mdl_Msu6, (fptype)m_pars->mdl_Wsu6 }; @@ -652,6 +605,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -772,8 +729,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -781,25 +738,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store 
the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -944,13 +1079,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -962,18 +1091,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -998,93 +1132,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1126,7 +1197,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1149,7 +1220,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1158,21 +1229,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1186,8 +1259,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1203,11 +1278,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1309,14 +1385,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h index d48c729c48..f01e3c5efd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 4; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 6; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
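
As a size check for the ghelAllJamps "super-buffer" appearing in the sigmaKin prototypes above (layout [2][ncolor][nGoodHel][nevt], matching the gpuMemset in sigmaKin), a hypothetical helper that is not part of the plugin API:

#include <cstddef>
constexpr std::size_t jampSuperBufferElems( std::size_t nGoodHel, std::size_t ncolor, std::size_t nevt )
{
  constexpr std::size_t nx2 = 2; // real and imaginary parts (mgOnGpu::nx2)
  return nx2 * ncolor * nGoodHel * nevt; // number of fptype elements (multiply by sizeof(fptype) for bytes)
}
// e.g. for this process (ncolor = 2), 4 good helicities and 16384 events:
static_assert( jampSuperBufferElems( 4, 2, 16384 ) == 262144, "illustrative size check" );
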
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
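
To make the triangular-matrix shortcut above concrete, here is a standalone sketch (separate from the generated function body in this diff, with a hypothetical name) of the same computation for this process's ncolor = 2 matrix; it relies on the two colour denominators being equal, as they are here, so that cf[i][j] / denom[i] is symmetric.

#include <array>
#include <complex>
double colorSumSketch( const std::array<std::complex<double>, 2>& jamp )
{
  constexpr int ncolor = 2;
  constexpr double denom[ncolor] = { 3, 3 };
  constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    // diagonal term, then doubled upper-diagonal terms (cf is real and symmetric, see #475)
    double ztR = cf[icol][icol] / denom[icol] * jamp[icol].real();
    double ztI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < ncolor; jcol++ )
    {
      ztR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
      ztI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
    }
    me2 += jamp[icol].real() * ztR + jamp[icol].imag() * ztI; // AMA + BMB
  }
  return me2; // e.g. jamp = { 1, i } gives 32/3
}
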
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- 
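For reference, the two gpuBlasTgemm calls and the two gpuBlasTgemmStridedBatched calls in color_sum_blas above evaluate, for every good helicity ihel and every event ievt, ME(ihel,ievt) += Re(J)^T C Re(J) + Im(J)^T C Im(J), where C is the real, pre-normalized color matrix and J is the vector of ncolor color amplitudes (jamps); for a symmetric C this equals the usual sum_ij J_i* C_ij J_j. The minimal host-side sketch below is an illustration only, not part of the plugin (the function name colorSumReference and the use of std::vector are assumptions of this example); it spells out the same computation and the same [real/imag][icol][ihel][ievt] striding in plain C++.

#include <vector>

void colorSumReference( std::vector<double>& allMEs,         // [nhel*nevt], incremented in place (index ihel*nevt+ievt)
                        const std::vector<double>& allJamps, // [2*ncolor*nhel*nevt], striding as in DeviceAccessJamp
                        const std::vector<double>& colorMat, // [ncolor*ncolor], real symmetric, already normalized
                        const int ncolor, const int nhel, const int nevt )
{
  for( int ihel = 0; ihel < nhel; ihel++ )
    for( int ievt = 0; ievt < nevt; ievt++ )
      for( int part = 0; part < 2; part++ ) // 0 = real parts, 1 = imaginary parts
      {
        double dot = 0; // J^T * ( C * J ) for this (part, ihel, ievt)
        for( int icol = 0; icol < ncolor; icol++ )
        {
          double ztemp = 0; // Step 1 equivalent: one element of Ztemp = ColorMatrix * JampsVector
          for( int jcol = 0; jcol < ncolor; jcol++ )
            ztemp += colorMat[icol * ncolor + jcol] * allJamps[part * ncolor * nhel * nevt + jcol * nhel * nevt + ihel * nevt + ievt];
          dot += allJamps[part * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] * ztemp; // Step 2 equivalent
        }
        allMEs[ihel * nevt + ievt] += dot; // beta=1: add to the running ME sum
      }
}

In the GPU implementation, Step 1 is a single ncolor x (nhel*nevt) GEMM per real/imaginary part, and Step 2 is a strided-batched GEMM of 1x1 dot products with beta=1, so the real and imaginary contributions accumulate into the same MEs buffer.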
/dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fbridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/makefile_original.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 
2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h index ec627d7759..85c140d111 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc index d596fdf1ec..eafa38c4dd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h index 26a532156c..a9dc1dce79 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing the color sum in the matrix element calculation +// For both CUDA and HIP, by default, assume that cuBLAS/hipBLAS are available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
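The MGONGPU_HAS_NO_BLAS block above sets the compile-time default (cuBLAS/hipBLAS assumed available in CUDA and HIP builds, never used in C++-only builds); color_sum_gpu additionally honours the runtime CUDACPP_RUNTIME_BLASCOLORSUM switch before taking the BLAS path. Below is a minimal sketch of how the two choices can be combined, assuming a hypothetical helper name and assuming that any non-empty value of the environment variable opts in.

#include <cstdlib>

// Hypothetical helper (not part of the plugin): decide whether color_sum_gpu should be given
// a cuBLAS/hipBLAS handle (BLAS path) or a null pointer (kernel path).
bool useBlasColorSum()
{
#ifdef MGONGPU_HAS_NO_BLAS
  return false; // hasNoBlas build: the BLAS color sum is compiled out
#else
  const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); // assumed semantics: opt-in via env variable
  return env != nullptr && *env != '\0';
#endif
}

When this returns false, the caller would pass a null gpuBlasHandle_t* and a null scratch buffer to color_sum_gpu, which then falls back to the per-helicity color_sum_kernel launches (CASE 1 above).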
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
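The scratch buffer for the BLAS color sum is sized exactly as the gpuMemset calls in color_sum_gpu and the pointer offsets in color_sum_blas require: one Ztemp block of ncolor*2*nGoodHel*nevt fptype2 values, plus, only in the mixed FPTYPE=double / FPTYPE2=float configuration selectable in mgOnGpuConfig.h above, a same-sized float copy of the jamps and an nGoodHel*nevt float block for the MEs. The arithmetic, as a hypothetical standalone helper (the name blasTmpSize is an assumption of this sketch):

#include <cstddef>

// Hypothetical helper (not in the plugin): number of fptype2 elements needed in the
// ghelAllBlasTmp scratch buffer, matching the sizes used in the gpuMemset calls of color_sum_gpu.
std::size_t blasTmpSize( const int ncolor, const int nGoodHel, const int nevt, const bool mixedFptypes )
{
  const std::size_t nx2 = 2;                                                            // real and imaginary parts
  const std::size_t slab = ncolor * nx2 * static_cast<std::size_t>( nGoodHel ) * nevt;  // one [2][ncolor][nhel][nevt] block
  if( mixedFptypes )                                                                    // FPTYPE=double with FPTYPE2=float
    return 2 * slab + static_cast<std::size_t>( nGoodHel ) * nevt;                      // Ztemp + float jamps + float MEs
  return slab;                                                                          // Ztemp only (jamps and MEs are used in place)
}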
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 9e7dad46ce..96e4d4a727 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,21 +549,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.118 s +1 processes with 3 diagrams generated in 0.083 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -576,54 +575,49 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s -Wrote files for 10 helas calls in 0.076 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 0.071 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.137 s +ALOHA: aloha creates 2 routines in 0.149 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 
0.133 s +ALOHA: aloha creates 4 routines in 0.122 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common -patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 -patching file driver.f -patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README +/home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. 
quit -real 0m2.854s -user 0m2.558s -sys 0m0.284s -Code generation completed in 3 seconds +real 0m3.237s +user 0m2.790s +sys 0m0.439s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * @@ -636,7 +630,7 @@ Code generation completed in 3 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -644,10 +638,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -666,7 +659,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.5 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -674,10 +667,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_tt.mad/COPYRIGHT b/epochX/cudacpp/susy_gg_tt.mad/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/susy_gg_tt.mad/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. 
Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt index 68b4c46295..311ceaa803 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat index 25f63a3016..22710756d6 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.5 2025-10-17 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt b/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts b/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f b/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc b/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/makefile b/epochX/cudacpp/susy_gg_tt.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/makefile +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc b/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. 
In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
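As an illustration of the block/thread selection in the Bridge constructor hunk above, the following is a minimal standalone sketch (not part of the patch): it keeps halving the number of threads per block, starting from the default of 256, until it divides the number of events exactly, and enforces the minimum of s_gputhreadsmin = 32 shown in the diff. The helper name chooseGpuGrid and its signature are hypothetical.

#include <stdexcept>
#include <string>

// Hypothetical helper mirroring the grid-sizing loop in Bridge::Bridge (sketch only)
inline void chooseGpuGrid( int nevt, int& gpublocks, int& gputhreads )
{
  constexpr int gputhreadsmin = 32;                        // minimum number of gpu threads (DEFAULT)
  if( nevt < gputhreadsmin || nevt % gputhreadsmin != 0 )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
  gputhreads = 256;                                        // default number of gpu threads
  gpublocks = nevt / gputhreads;                           // integer division: may not cover nevt yet
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2;                                       // halve the block size until it divides nevt
    if( gputhreads < gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" ); // cannot happen if nevt is a multiple of 32
    gpublocks = nevt / gputhreads;
  }
}

For example, nevt=8192 gives 32 blocks of 256 threads, while nevt=96 falls back to 3 blocks of 32 threads.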
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
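The transposition hunks above repeatedly use the same AOS-to-AOSOA index arithmetic; the following is a minimal host-side sketch (not part of the patch) of that mapping, with npar, np4 and neppM as plain parameters rather than the CPPProcess and MemoryAccessMomenta constants used in the generated code.

#include <cassert>

// Sketch of the F2C (Fortran AOS to C++ AOSOA) momenta transposition (illustration only)
template<typename T>
void transposeMomentaF2C( T* out, const T* in, int nevt, int npar, int np4, int neppM )
{
  assert( nevt % neppM == 0 ); // number of events must be a multiple of neppM
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    const int ipagM = ievt / neppM; // SIMD "page" index
    const int ieppM = ievt % neppM; // event index within the page
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        // C-style AOSOA[npagM][npar][np4][neppM]
        const int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
        // Fortran-style AOS[nevt][npar][np4]
        const int fpos = ievt * npar * np4 + ipar * np4 + ip4;
        out[cpos] = in[fpos]; // F2C (Fortran to C)
      }
  }
}

As noted in the diff, with neppM=1 the two layouts coincide and the loop degenerates into a plain copy.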
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt index 084e244cea..b55f10804f 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.5 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
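To make the new abstraction layer above concrete, here is a hedged usage sketch (not part of the patch, and not the actual color_sum_blas implementation): it shows how a caller might combine the gpuBlas* aliases from GpuAbstraction.h with the checkGpuBlas assertion added to GpuRuntime.h. It assumes mgOnGpuConfig.h defines fptype2, that the file is compiled with nvcc or hipcc, and that the buffer names and dimensions are placeholders.

#ifndef MGONGPU_HAS_NO_BLAS
// Sketch: one strided-batched GEMM, issued on a caller-provided stream (illustration only)
inline void batchedGemmSketch( const fptype2* d_A, const fptype2* d_B, fptype2* d_C,
                               int n, int nbatch, gpuStream_t stream )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // e.g. one of the per-helicity streams
  const fptype2 alpha = 1;
  const fptype2 beta = 0;
  // C_i = A_i * B_i for nbatch independent n-by-n column-major matrices
  checkGpuBlas( gpuBlasTgemmStridedBatched( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                                            n, n, n,
                                            &alpha,
                                            d_A, n, (long long)n * n,
                                            d_B, n, (long long)n * n,
                                            &beta,
                                            d_C, n, (long long)n * n,
                                            nbatch ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif

The gpuBlasTgemmStridedBatched alias resolves to the single or double precision cuBLAS/hipBLAS routine according to MGONGPU_FPTYPE2_FLOAT, so the same caller code serves both builds.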
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
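The runtime BLAS switch wired into the MatrixElementKernelDevice constructor above boils down to a small environment-variable check; the following standalone sketch (not part of the patch; the helper name is hypothetical) reproduces that decision logic, including the hard error when BLAS support was compiled out.

#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

// Sketch of the CUDACPP_RUNTIME_BLASCOLORSUM decision (illustration only)
inline bool envRequestsBlasColorSum()
{
  const char* blasEnv = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  const bool requested = ( blasEnv && std::string( blasEnv ) != "" ); // set and non-empty
#ifndef MGONGPU_HAS_NO_BLAS
  if( requested )
    std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl;
  else
    std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl;
  return requested;
#else
  if( requested )
    throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" );
  std::cout << "INFO: BLAS was disabled at build time" << std::endl;
  return false;
#endif
}

The TF32 tensor-core switch (CUDACPP_RUNTIME_CUBLASTF32TENSOR) in the same constructor follows the same pattern, with the extra constraints that BLAS must be enabled at runtime and that the color sums use FP32.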
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h index 5bd3053393..c5e79dc1b1 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ 
-194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + 
typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 5c62f1bfad..9eb05a51e9 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE 
void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
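// Illustrative sketch, assuming the color-major SoA layout used by the DeviceAccessJamp2 accessor above
// (element = buffer[icol * nevt + ievt], buffer size = ncolor * nevt): a hypothetical host-side helper,
// e.g. to sum jamp2 over colors for one event after a device-to-host copy.
// The helper name is an assumption for illustration only and does not exist in this patch.
inline fptype hostSumJamp2OverColors( const fptype* hstJamp2s, const int ievt, const int nevt, const int ncolor )
{
  fptype sum = 0;
  for( int icol = 0; icol < ncolor; icol++ )
    sum += hstJamp2s[icol * nevt + ievt]; // same indexing as DeviceAccessJamp2::kernelAccessIcolConst
  return sum;
}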
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -368,154 +424,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
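// Illustrative sketch of the identity quoted in the removed comment above: for a REAL SYMMETRIC
// color matrix cf and jamp = A + iB, the hermitian quadratic form conj(jamp)^T * cf * jamp equals
// A^T*cf*A + B^T*cf*B, because the cross terms i*( A^T*cf*B - B^T*cf*A ) cancel when cf is symmetric.
// The standalone helper below is a hypothetical double-precision check, not code from this repository.
#include <complex>
#include <cstddef>
#include <vector>
inline double colorQuadraticForm( const std::vector<std::vector<double>>& cf,     // real symmetric ncolor x ncolor matrix
                                  const std::vector<std::complex<double>>& jamp ) // ncolor complex partial amplitudes
{
  double sum = 0;
  for( std::size_t i = 0; i < jamp.size(); i++ )
    for( std::size_t j = 0; j < jamp.size(); j++ )
      sum += cf[i][j] * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() );
  return sum; // the real (and only non-vanishing) part of conj(jamp)^T * cf * jamp
}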
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -555,7 +500,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +537,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +581,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -748,8 +705,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +714,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event 
random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -920,13 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,18 +1067,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -974,93 +1108,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1173,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1196,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,21 +1205,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1162,8 +1235,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1254,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1361,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 24c27005b8..f74d539775 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index bc9bcfeb9b..008afc92ae 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index db3c284caa..fc3ede89c4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -442,51 +446,62 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) - DO IVEC=1,VECSIZE_USED - DSIGUU = ALL_OUT(IVEC) - IF (IMODE.EQ.5) THEN - IF (DSIGUU.LT.1D199) THEN - ALL_OUT(IVEC) = DSIGUU*CONV - ELSE - ALL_OUT(IVEC) = 0.0D0 - ENDIF - RETURN + DO CURR_WARP=1, NB_WARP_USED + IF(IMIRROR_VEC(CURR_WARP).EQ.1)THEN + IB(1) = 1 + IB(2) = 2 + ELSE + IB(1) = 2 + IB(2) = 1 ENDIF + DO IWARP=1, WARP_SIZE + IVEC = (CURR_WARP-1)*WARP_SIZE+IWARP + DSIGUU = ALL_OUT(IVEC) + IF (IMODE.EQ.5) THEN + IF (DSIGUU.LT.1D199) THEN + ALL_OUT(IVEC) = DSIGUU*CONV + ELSE + ALL_OUT(IVEC) = 0.0D0 + ENDIF + RETURN + ENDIF - XBK(:) = ALL_XBK(:,IVEC) -C CM_RAP = ALL_CM_RAP(IVEC) - Q2FACT(:) = ALL_Q2FACT(:, IVEC) + XBK(:) = ALL_XBK(:,IVEC) +C CM_RAP = ALL_CM_RAP(IVEC) + Q2FACT(:) = ALL_Q2FACT(:, IVEC) - IF(FRAME_ID.NE.6)THEN - CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) - ELSE - P1 = ALL_PP(:,:,IVEC) - ENDIF -C call restore_cl_val_to(ivec) -C DSIGUU=DSIGUU*REWGT(P1,ivec) - DSIGUU=DSIGUU*ALL_RWGT(IVEC) + IF(FRAME_ID.NE.6)THEN + CALL BOOST_TO_FRAME(ALL_PP(0,1,IVEC), FRAME_ID, P1) + ELSE + P1 = ALL_PP(:,:,IVEC) + ENDIF +C call restore_cl_val_to(ivec) +C DSIGUU=DSIGUU*REWGT(P1,ivec) + DSIGUU=DSIGUU*ALL_RWGT(IVEC) -C Apply the bias weight specified in the run card (default is -C 1.0) - DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) +C Apply the bias weight specified in the run card (default is +C 1.0) + DSIGUU=DSIGUU*CUSTOM_BIAS(P1,DSIGUU,1, IVEC) - DSIGUU=DSIGUU*NFACT + DSIGUU=DSIGUU*NFACT - IF (DSIGUU.LT.1D199) THEN -C Set sign of dsig based on sign of PDF and matrix element - ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU - $ *ALL_PD(IPSEL,IVEC)) - ELSE - WRITE(*,*) 'Error in matrix element' - DSIGUU=0D0 - ALL_OUT(IVEC)=0D0 - ENDIF -C Generate events only if IMODE is 
0. - IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN -C Call UNWGT to unweight and store events - CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, - $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) - ENDIF + IF (DSIGUU.LT.1D199) THEN +C Set sign of dsig based on sign of PDF and matrix element + ALL_OUT(IVEC)=DSIGN(CONV*ALL_PD(0,IVEC)*DSIGUU,DSIGUU + $ *ALL_PD(IPSEL,IVEC)) + ELSE + WRITE(*,*) 'Error in matrix element' + DSIGUU=0D0 + ALL_OUT(IVEC)=0D0 + ENDIF +C Generate events only if IMODE is 0. + IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN +C Call UNWGT to unweight and store events + ICONFIG = SYMCONF(ICONF_VEC(CURR_WARP)) + CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1 + $ , SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) + ENDIF + ENDDO ENDDO END diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity 
+ const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given 
event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! 
From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need 
one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc index 99d3eecc56..0dbac30825 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc +++ 
b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/configs.inc @@ -24,3 +24,5 @@ C Diagram 3 DATA (SPROP(I,-2,3),I=1,1)/0/ C Number of configs DATA MAPCONFIG(0)/3/ +C used fake id + DATA FAKE_ID/7/ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f index ec5722702a..30cca27587 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f @@ -76,6 +76,7 @@ Program DRIVER include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc) INTEGER VECSIZE_USED + DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime character*255 env_name, env_value integer env_length, env_status @@ -121,7 +122,6 @@ Program DRIVER endif #endif - vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' call get_environment_variable(env_name, env_value, env_length, env_status) if( env_status.eq.0 ) then @@ -147,6 +147,7 @@ Program DRIVER FBRIDGE_CBYF1MAX = -1D100 FBRIDGE_CBYF1MIN = 1D100 #endif + c c Read process number c @@ -280,6 +281,7 @@ Program DRIVER c write(*,*) 'Final xsec: ',xsec rewind(lun) + close(lun) #ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -307,6 +309,7 @@ Program DRIVER ENDIF #endif CALL COUNTERS_FINALISE() + end c $B$ get_user_params $B$ ! tag for MadWeight @@ -489,7 +492,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fbridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/makefile_original.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index c9610a83ed..35011737bd 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -71,10 +71,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, DATA NB_FAIL /0/ DOUBLE PRECISION GET_CHANNEL_CUT EXTERNAL GET_CHANNEL_CUT -C - INTEGER NGOODHEL ! -1 if not yet retrieved and printed - SAVE NGOODHEL - DATA NGOODHEL/-1/ + C C This is just to temporarily store the reference grid for C helicity of the DiscreteSampler so as to obtain its number of @@ -227,17 +224,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, ENDIF IF(NTRY(1).EQ.MAXTRIES)THEN ISHEL=MIN(ISUM_HEL,NGOOD) -C Print the number of good helicities - IF (NGOODHEL.EQ.-1) THEN - NGOODHEL=0 - DO I=1,NCOMB - IF (GOODHEL(I,1)) THEN - NGOODHEL=NGOODHEL+1 - ENDIF - END DO - WRITE (6,*) 'NGOODHEL =', NGOODHEL - WRITE (6,*) 'NCOMB =', NCOMB - ENDIF ENDIF ENDIF ELSE IF (.NOT.INIT_MODE) THEN ! random helicity @@ -307,7 +293,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +336,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +380,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +436,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +450,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f index 9a31ed201d..d6cded9a2d 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! 
color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else @@ -220,7 +221,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, ncolmp=0 endif if(mo_color.gt.1.and. - $ mo_color.ne.3.and.mo_color.ne.8)then + $ mo_color.ne.3.and.mo_color.ne.8.and.mo_color.ne.6)then da_color(1)=get_color(jpart(1,ida(1))) da_color(2)=get_color(jpart(1,ida(2))) call write_error(da_color(1), da_color(2), mo_color) @@ -326,8 +327,8 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif endif !end of check on LC -c Just zero helicity info for intermediate states - jpart(7,i) = 0 +c Just No helicity info for intermediate states + jpart(7,i) = 9 enddo ! do i 100 continue if (is_LC) call check_pure_internal_flow(icolalt,jpart, maxcolor) @@ -586,13 +587,13 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, i3=i3+1 c color for t-channels needs to be reversed if(i3.eq.1) icol(2,ires)=icolmp(1,i) - if(i3.eq.2) icol(1,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 c color for t-channels needs to be reversed if(i3bar.eq.1) icol(1,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(2,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(2,i) endif enddo @@ -764,6 +765,14 @@ function fix_tchannel_color(mo_color,maxcolor,ncolmp,icolmp,ires, endif endif c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) + elseif(mo_color.eq.6.and.i3.eq.0.and.i3bar.eq.2)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue + elseif(mo_color.eq.6.and.i3.eq.2.and.i3bar.eq.0)then +c correct +c might consider to undo the identical final state for epsilon/epsilonbar + continue else c Don't know how to deal with this call write_error(i3,i3bar,mo_color) @@ -814,12 +823,12 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(icolmp(1,i).gt.0)then i3=i3+1 if(i3.eq.1) icol(1,ires)=icolmp(1,i) - if(i3.eq.2) icol(2,ires)=-icolmp(1,i) + if(i3.eq.2.and.icol(2,ires).eq.0) icol(2,ires)=-icolmp(1,i) endif if(icolmp(2,i).gt.0)then i3bar=i3bar+1 if(i3bar.eq.1) icol(2,ires)=icolmp(2,i) - if(i3bar.eq.2) icol(1,ires)=-icolmp(2,i) + if(i3bar.eq.2.and.icol(1,ires).eq.0) icol(1,ires)=-icolmp(2,i) endif enddo @@ -830,23 +839,33 @@ function elim_indices(n3,n3bar,ncolmp,icolmp,ires,icol, if(n3.le.1.and.n3bar.eq.0) icol(2,ires)=0 if(i3.ne.n3.or.i3bar.ne.n3bar) then - if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.0.and.i3.eq.0)then + if(n3.gt.0.and.n3bar.eq.0.and.mod(i3bar+n3,3).eq.i3)then c This is an epsilon index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(1,ires)=maxcolor + if(i3.eq.0) then + maxcolor=maxcolor+1 + icol(1,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(2,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(2,ires)=-maxcolor endif - 
elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.0.and.i3bar.eq.0)then + elseif(n3bar.gt.0.and.n3.eq.0.and.mod(i3+n3bar,3).eq.i3bar)then c This is an epsilonbar index interaction c write(*,*) i3, n3, i3bar, n3bar, ires - maxcolor=maxcolor+1 - icol(2,ires)=maxcolor + if(i3bar.eq.0)then + maxcolor=maxcolor+1 + icol(2,ires)=maxcolor + endif if(n3.eq.2)then maxcolor=maxcolor+1 icol(1,ires)=-maxcolor + elseif(n3bar.eq.2)then + maxcolor=maxcolor+1 + icol(1,ires)=-maxcolor endif elseif(n3.gt.0.and.n3bar.eq.0.and.i3-i3bar.eq.n3.or. $ n3bar.gt.0.and.n3.eq.0.and.i3bar-i3.eq.n3bar.or. @@ -961,6 +980,12 @@ subroutine fix_s_color_indices(n3,n3bar,i3,i3bar,ncolmp,icolmp, if(n3.eq.1) icol(1,ires)=max_n3 if(n3bar.eq.1) icol(2,ires)=min_n3bar endif + do i=ires,-1 + if (icol(1,i).eq.maxcol) icol(1,i)=mincol + if (icol(1,i).eq.-maxcol) icol(1,i)=-mincol + if (icol(2,i).eq.maxcol) icol(2,i)=mincol + if (icol(2,i).eq.-maxcol) icol(2,i)=-mincol + enddo c print *,'Set mother color for ',ires,' to ',(icol(j,ires),j=1,2) endif else diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
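// A minimal worked sketch of the per-event color sum behind the interfaces declared in this header
// (implemented in color_sum.cc above). The free function and the jampRe/jampIm names are hypothetical
// illustrations, not part of the generated code, and plain double stands in for fptype/fptype2; the
// ncolor, colorMatrix and colorDenom constants are those defined in color_sum.cc:
//   ME += sum_{i,j} Re( conj(jamp_i) * jamp_j ) * colorMatrix[i][j] / colorDenom[i]
// which, since the color matrix is real, reduces to two real quadratic forms:
//   double colorSumOneEvent( const double jampRe[], const double jampIm[] )
//   {
//     double deltaME = 0;
//     for( int icol = 0; icol < ncolor; icol++ )
//       for( int jcol = 0; jcol < ncolor; jcol++ )
//         deltaME += ( jampRe[icol] * jampRe[jcol] + jampIm[icol] * jampIm[jcol] )
//                    * colorMatrix[icol][jcol] / colorDenom[icol];
//     return deltaME; // the caller adds this to the running sum over helicities in allMEs
//   }
// The device super-buffers below use the [2][ncolor][nhel][nevt] striding documented in DeviceAccessJamp,
// i.e. element ( reim, icol, ihel, ievt ) sits at index ( ( reim * ncolor + icol ) * nhel + ihel ) * nevt + ievt.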
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef 
MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
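# Illustrative usage sketch for the BLAS-related build options configured further below (these exact
# command lines are hypothetical examples, not generated content):
#   make -f cudacpp.mk BACKEND=cuda HASBLAS=hasBlas    # link against cuBLAS (the default whenever cublas_v2.h is found)
#   make -f cudacpp.mk BACKEND=hip HASBLAS=hasNoBlas MADGRAPH_HIP_ARCHITECTURE=gfx90a
# Note that, per the comments in color_sum.cc above, a hasBlas build still falls back to the per-helicity
# CUDA/HIP color sum kernel unless the CUDACPP_RUNTIME_BLASCOLORSUM environment variable is set at run time.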
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
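# As an illustration (not generated content): in this P1_gg_ttx directory the wildcard picks up
# auto_dsig.f and auto_dsig1.f, and auto_dsig.f itself is filtered out, so the groups defined just
# below would expand roughly as
#   DSIG         = driver.o auto_dsig1.o
#   DSIG_cudacpp = driver_cudacpp.o auto_dsig1_cudacpp.o
# while auto_dsig.o is compiled on its own and added explicitly in the link rules further down.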
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. 
Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f index 1c32e93f5d..5449ab9e30 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f @@ -124,7 +124,8 @@ subroutine gen_mom(iconfig,mincfig,maxcfig,invar,wgt,x,p1) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer fake_id + common/to_sprop/sprop,tprid,fake_id logical firsttime double precision xprop(3,nexternal),tprop(3,nexternal) @@ -1373,6 +1374,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1389,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1637,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1657,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1818,8 +1834,8 @@ double precision function get_channel_cut(p, config) common/to_forest/ iforest, tstrategy integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) - integer tprid(-max_branch:-1,lmaxconfigs) - common/to_sprop/sprop,tprid + integer tprid(-max_branch:-1,lmaxconfigs), fake_id + common/to_sprop/sprop,tprid,fake_id double precision stot,m1,m2 common/to_stot/stot,m1,m2 @@ -1915,7 +1931,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1946,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile deleted file mode 100644 index 49e6800fff..0000000000 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile +++ /dev/null @@ -1,327 +0,0 @@ -SHELL := /bin/bash - -include ../../Source/make_opts - -# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) -# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing -include ../../src/cudacpp_config.mk -ifeq ($(CUDACPP_BUILDDIR),) -$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) -endif - -# Disable all Fortran warnings? 
-FFLAGS+= -w - -# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -FFLAGS+= -cpp - -# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) -CXXFLAGS = -O3 -Wall -Wshadow -Wextra - -# Add -std=c++17 explicitly to avoid build errors on macOS -# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" -ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 -endif - -# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1) -### override FC:=ccache $(FC) -###endif - -# Load additional dependencies of the bias module, if present -ifeq (,$(wildcard ../bias_dependencies)) -BIASDEPENDENCIES = -else -include ../bias_dependencies -endif - -# Definitions - -LIBDIR = ../../lib/ -BINDIR = ../../bin/ -PROG = madevent - -ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") - include ../MadLoop_makefile_definitions -else - LINK_LOOP_LIBS = - LOOP_LIBS = - LOOP_INCLUDE = - LINK_MADLOOP_LIB = - MADLOOP_LIB = -endif - -LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias - -CUDACPP_MAKEFILE=cudacpp.mk -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -ifeq ($(BACKEND),cuda) -CUDACPP_COMMONLIB=mg5amc_common_cuda -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda -else ifeq ($(BACKEND),hip) -CUDACPP_COMMONLIB=mg5amc_common_hip -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip -else -CUDACPP_COMMONLIB=mg5amc_common_cpp -CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp -endif - -LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) - -ifneq ("$(wildcard ../../Source/RUNNING)","") - LINKLIBS += -lrunning - LIBS += $(LIBDIR)librunning.$(libext) -endif - - -# Source files - -MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) -MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) -ifeq ($(strip $(MATRIX_HEL)),) - MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) -endif - - -PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \ - cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ - idenparts.o dummy_fct.o - -DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) -DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) - -SYMMETRY = symmetry.o idenparts.o - -# Binaries - -ifeq ($(UNAME),Darwin) -LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) -LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" -else -LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) -endif - -# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) -.DEFAULT_GOAL := all - -ifeq ($(BACKEND),cuda) -all: $(PROG)_fortran 
$(CUDACPP_BUILDDIR)/$(PROG)_cuda -else ifeq ($(BACKEND),hip) -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip -else -all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp -endif - -# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) -ifeq ($(USEOPENMP),1) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -LINKLIBS += -liomp5 # see #578 -LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang -else -override OMPFLAGS = -fopenmp -endif -endif - -$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) - -$(LIBS): .libs - -.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat - cd ../../Source; make - touch $@ - -$(CUDACPP_BUILDDIR)/.cudacpplibs: - $(MAKE) -f $(CUDACPP_MAKEFILE) - touch $@ - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override LIBFLAGSRPATH = -else ifeq ($(USEBUILDDIR),1) - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' -else - override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' -endif - -.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link - -madevent_fortran_link: $(PROG)_fortran - rm -f $(PROG) - ln -s $(PROG)_fortran $(PROG) - -madevent_cuda_link: - $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) - -madevent_hip_link: - $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) - -madevent_cpp_link: - $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto -madevent_%_link: - @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! 
Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi - $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp - rm -f $(PROG) - ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) - -# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) -$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_cuda now uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -# Building $(PROG)_hip also uses its own rule -$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - -counters.o: counters.cc timer.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ - -$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) - -gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) - -###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile! -###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat -### cd ../../Source/MODEL; make -### -###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat -### cd ../../Source; make -### -###$(LIBDIR)libpdf.$(libext): -### cd ../../Source/PDF; make -### -###$(LIBDIR)libgammaUPC.$(libext): -### cd ../../Source/PDF/gammaUPC; make -###endif - -# Add source so that the compiler finds the DiscreteSampler module. 
-$(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC -%_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ - -# Dependencies - -driver.f: genps.inc -symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc -genps.o: genps.inc nexternal.inc configs.inc -dummy_fct.0: run.inc genps.inc -cuts.o: genps.inc nexternal.inc pmass.inc -setcuts.o: genps.inc run_config.inc -invarients.o: genps.inc nexternal.inc -myamp.o: props.inc genps.inc nexternal.inc -reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ - run_config.inc -cluster.o: cluster.inc genps.inc nexternal.inc message.inc -addmothers.o: genps.inc nexternal.inc symswap.inc message.inc -unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ - run_config.inc -initcluster.o: message.inc - -# Extra dependencies on discretesampler.mod - -auto_dsig.o: .libs -driver.o: .libs -driver_cudacpp.o: .libs -$(MATRIX): .libs -genps.o: .libs - -# Cudacpp bldall targets - -ifeq ($(UNAME_P),ppc64le) -bldavxs: bldnone bldsse4 -else ifeq ($(UNAME_P),arm) -bldavxs: bldnone bldsse4 -else -bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z -endif - -ifneq ($(shell which hipcc 2>/dev/null),) -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldhip bldcuda bldavxs -else -bldall: bldhip bldavxs -endif -else -ifneq ($(shell which nvcc 2>/dev/null),) -bldall: bldcuda bldavxs -else -bldall: bldavxs -endif -endif - -bldcuda: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cuda - -bldhip: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=hip - -bldnone: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppnone - -bldsse4: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 - -bldavx2: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 - -bld512y: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y - -bld512z: $(PROG)_fortran $(DSIG_cudacpp) - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z - -# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) - -clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn - $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip - -cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src - $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall - rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs - rm -f .libs - -cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src - make -C ../../Source cleanall - rm -rf $(LIBDIR)libbias.$(libext) - rm -f ../../Source/*.mod ../../Source/*/*.mod - -distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation - $(MAKE) -f $(CUDACPP_MAKEFILE) distclean diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile new file mode 120000 index 0000000000..9fba275947 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile @@ -0,0 +1 @@ +makefile_wrapper.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk 
b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk new file mode 100644 index 0000000000..348c283be7 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_original.mk @@ -0,0 +1,101 @@ +include ../../Source/make_opts +FFLAGS+= -w + +# Load additional dependencies of the bias module, if present +ifeq (,$(wildcard ../bias_dependencies)) +BIASDEPENDENCIES = +else +include ../bias_dependencies +endif + +# Definitions + +LIBDIR = ../../lib/ +BINDIR = ../../bin/ +PROG = madevent + +ifneq ("$(wildcard ../MadLoop_makefile_definitions)","") + include ../MadLoop_makefile_definitions +else + LINK_LOOP_LIBS = + LOOP_LIBS = + LOOP_INCLUDE = + LINK_MADLOOP_LIB = + MADLOOP_LIB = +endif + +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias + +LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) + +ifneq ("$(wildcard ../../Source/RUNNING)","") + LINKLIBS += -lrunning + LIBS += $(LIBDIR)librunning.$(libext) +endif + + +# Source files + +MATRIX_HEL = $(patsubst %.f,%.o,$(wildcard matrix*_orig.f)) +MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*_optim.f)) +ifeq ($(strip $(MATRIX_HEL)),) + MATRIX = $(patsubst %.f,%.o,$(wildcard matrix*.f)) +endif + + +PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o \ + $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \ + +SYMMETRY = symmetry.o idenparts.o + +# Binaries + +$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX) + $(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) + +$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat + cd ../../Source/MODEL; make + +$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat + cd ../../Source; make + +$(LIBDIR)libpdf.$(libext): + cd ../../Source/PDF; make + +$(LIBDIR)libgammaUPC.$(libext): + cd ../../Source/PDF/gammaUPC; make + +# Add source so that the compiler finds the DiscreteSampler module. 
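+# (Note: in the stock MG5aMC layout the DiscreteSampler module is built under ../../Source,
+# which is why the compile rules below pass -I../../Source/ so that the Fortran compiler can
+# resolve the corresponding .mod file; the exact module location is assumed here rather than
+# spelled out in this patch.)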
+$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +# Dependencies + +driver.f: genps.inc +symmetry.o: genps.inc nexternal.inc configs.inc run_config.inc ../../Source/run_card.inc +genps.o: genps.inc nexternal.inc configs.inc +dummy_fct.0: run.inc genps.inc +cuts.o: genps.inc nexternal.inc pmass.inc +setcuts.o: genps.inc run_config.inc +invarients.o: genps.inc nexternal.inc +myamp.o: props.inc genps.inc nexternal.inc +reweight.o: sudakov.inc cluster.inc sudakov.inc run.inc message.inc \ + run_config.inc +cluster.o: cluster.inc genps.inc nexternal.inc message.inc +addmothers.o: genps.inc nexternal.inc symswap.inc message.inc +unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ + run_config.inc +initcluster.o: message.inc + +clean: + $(RM) *.o gensym madevent madevent_forhel diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f index 0a0bafa7c1..9d8fe1c4f0 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f @@ -976,9 +976,9 @@ logical function setclscales(p, keepq2bck, ivec) $ ' and jcentral is ',jcentral(1),jcentral(2) if (btest(mlevel,3)) then - write(*,'(a$)') 'QCD jets (final): ' + write(*,'(a,$)') 'QCD jets (final): ' do i=3,nexternal - if(iqjets(i).gt.0) write(*,'(i3$)') i + if(iqjets(i).gt.0) write(*,'(i3,$)') i enddo write(*,*) endif @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) @@ -1387,7 +1387,9 @@ double precision function rewgt(p, ivec) integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) - include 'configs.inc' + integer fake_id + common/to_sprop/sprop,tprid,fake_id +c include 'configs.inc' real*8 xptj,xptb,xpta,xptl,xmtc real*8 xetamin,xqcut,deltaeta common /to_specxpt/xptj,xptb,xpta,xptl,xmtc,xetamin,xqcut,deltaeta @@ -1588,6 +1590,8 @@ double precision function rewgt(p, ivec) $ ipdgcl(1,igraphs(1),iproc),ipart,.false.).and. $ (goodjet(idacl(n,1)).or.goodjet(idacl(n,2)))) then c alpha_s weight + + if(ipdgcl(imocl(n),igraphs(1),iproc).ne.fake_id)then rewgt=rewgt*alphas(alpsfact*sqrt(q2now))/asref c Store information for systematics studies if(use_syst)then @@ -1600,6 +1604,7 @@ double precision function rewgt(p, ivec) write(*,*)' as: ',alphas(alpsfact*dsqrt(q2now)), & '/',asref,' -> ',alphas(alpsfact*dsqrt(q2now))/asref write(*,*)' and G=',SQRT(4d0*PI*ALPHAS(scale)) + endif endif endif endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f index 309540a0a2..d0706e90b4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f @@ -51,6 +51,7 @@ program symmetry integer tstrategy(lmaxconfigs) integer sprop(maxsproc,-max_branch:-1,lmaxconfigs) integer tprid(-max_branch:-1,lmaxconfigs) + integer fake_id include 'configs.inc' data use_config/0,lmaxconfigs*0/ @@ -232,7 +233,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +243,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +261,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py index 42d82818d0..2bc6174b85 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3306,7 +3353,7 @@ def edit_dummy_fct_from_file(self, filelist, outdir): def retro_compatible_custom_fct(lines, mode=None): f77_type = ['real*8', 'integer', 'double precision', 'logical'] - function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ + function_pat = re.compile(r'^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ % {'type':'|'.join(f77_type)}, re.I+re.M) include_pat = re.compile(r"\s+include\s+[\'\"]([\w\./]*)") @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - 
logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
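            # bench and the per-key results may have been stored as strings (e.g. when read
            # back from a saved scan summary), so the float() coercions above keep the
            # formatted write below from failing on non-numeric types.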
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py index 9ff7390cf5..8de498fcc2 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. 
Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. (beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. 
Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" @@ -6868,8 +6890,9 @@ def handle_alarm(signum, frame): else: log_level=20 - - if run_card: + if run_card and (run_card['lpp1'] !=0 or run_card['lpp2'] !=0): + # They are likely case like lpp=+-3, where alpas not need reset + # but those have dedicated name of pdf avoid the reset as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, 'cteq6_l': 0.118, diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: 
raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
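# Standalone sketch of the cp() behaviour above: copying a file onto itself is
# treated as a no-op rather than an error. The temporary file is created here
# only for the example.
import os
import shutil
import tempfile

def cp_quiet(src, dst):
    try:
        shutil.copy(src, dst)
    except shutil.SameFileError:
        # raised when src and dst resolve to the same file: nothing to do
        return

fd, path = tempfile.mkstemp()
os.close(fd)
cp_quiet(path, path)  # silently does nothing
os.remove(path)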
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py index 51ae2914fc..0883cd9613 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the 
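# Numeric sketch of the channel-splitting arithmetic introduced in
# get_job_for_event() above: ceiling division of the requested events by
# max_request_event, capped by max_splitting. The numbers are hypothetical
# (2500 matches the new gridpack maxevts default).
def n_splits(needed_event, max_request_event, max_splitting):
    nb_split = int(max(1, ((needed_event - 1) // max_request_event) + 1))
    return max(1, min(nb_split, max_splitting))

print(n_splits(9000, 2500, 100))  # 4 -> four jobs of ~2250 events each
print(n_splits(1800, 2500, 100))  # 1 -> no splitting needed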
variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2405,10 +2404,10 @@ def output(self, path, format='gnuplot',number_of_ratios = -1, gnuplot_output_list=gnuplot_output_list_v5 else: output, _ = p.communicate() - output.decode(errors='ignore') + output = output.decode(errors='ignore') if not output: gnuplot_output_list=gnuplot_output_list_v5 - elif float(output.split()[1]) < 5. : + elif int(output.split()[1].split('.')[0]) < 5 : gnuplot_output_list=gnuplot_output_list_v4 else: gnuplot_output_list=gnuplot_output_list_v5 @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. 
Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
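# Sketch of the gnuplot version test above: compare the integer major version
# instead of parsing the whole token as a float, which would fail for a token
# such as '5.4.8'. The sample string is illustrative.
version_output = 'gnuplot 5.4 patchlevel 8'
major = int(version_output.split()[1].split('.')[0])
use_v4_templates = major < 5
print(major, use_v4_templates)  # 5 False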
import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py index f6e47956cd..d4b94bab10 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -1792,7 +1794,10 @@ def add_decays(self, pdg_to_decay): if particle.pdg in pdg_to_decay and pdg_to_decay[particle.pdg]: one_decay = pdg_to_decay[particle.pdg].pop() self.add_decay_to_particle(i, one_decay) + particle.helicity = 9 return 
self.add_decays(pdg_to_decay) + + return self @@ -2166,10 +2171,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2961,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..dea35930ea 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz 
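# Quick check of the corrected pseudorapidity above,
# eta = 0.5 * ln((|p| + pz) / (|p| - pz)):
# a particle moving mostly along +z must get a large positive eta.
import math

def pseudorapidity(px, py, pz):
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

print(pseudorapidity(1.0, 0.0, 10.0))  # ~ +3.0, as expected for pz > 0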
%(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3667,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_comine_iteration(self, line): + def do_combine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
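# Condensed sketch of the shower-driver selection above: by default use
# Pythia8's bundled main164 example, falling back to the legacy
# MG5aMC_PY8_interface when '--old_interface' is requested. The path layout
# mirrors the one probed above; both base paths are placeholders.
import os

def pick_pythia_main(args, pythia8_path, mg5amc_py8_interface_path):
    if '--old_interface' in args:
        return os.path.join(mg5amc_py8_interface_path, 'MG5aMC_PY8_interface')
    main164 = os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164')
    if not os.path.exists(main164):
        main164 = os.path.join(pythia8_path, 'examples', 'main164')
    return main164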
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -5596,6 +5635,19 @@ def do_plot(self, line): else: logger.info('No valid files for delphes plot') + def do_compile(self, line): + """compile the current directory """ + + args = self.split_arg(line) + self.ask_run_configuration(mode='parton') + self.run_card = banner_mod.RunCard(pjoin(self.me_dir, 'Cards', 'run_card.dat')) + self.configure_directory(html_opening =False) + + for Pdir in self.get_Pdir(): + misc.sprint(Pdir) + self.compile(['gensym'], cwd=Pdir) + self.compile(['madevent_forhel'], cwd=Pdir) + ############################################################################ def do_syscalc(self, line): """Evaluate systematics variation weights for a given run""" @@ -6132,7 +6184,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in 
Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file.' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6896,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6906,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7023,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... 
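# Minimal sketch of the filtering idea behind remove_empty_events() above:
# drop channel directories whose events.lhe is missing or essentially empty
# (the 10-byte threshold mirrors the one used above). Directory names passed
# in are hypothetical.
import os

def keep_nonempty(gdirs):
    kept = []
    for gdir in gdirs:
        try:
            size = os.path.getsize(os.path.join(gdir, 'events.lhe'))
        except OSError:
            size = 0
        if size >= 10:
            kept.append(gdir)
    return kept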
self.exec_cmd('combine_events') @@ -6902,6 +7051,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7066,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7078,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7203,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7223,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . 
-mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/madevent b/epochX/cudacpp/susy_gg_tt.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/madevent +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h index 9ed58e24f1..f5c68fb7c4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc index aa00d6a9e4..0fd9310ffa 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h index 3e29f2ccbe..5a7f431dc1 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
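# Small sketch of the get_attr convention added to make_all_html_results()
# above: None keeps the historical (xsec, xerru) pair, a tuple returns one
# value per attribute name, and a string returns a single attribute. The
# Results class is a stand-in with made-up numbers.
class Results:
    xsec, xerru, axsec = 1.23, 0.04, 1.30

def summary(results, get_attr=None):
    if not get_attr:
        return results.xsec, results.xerru
    if isinstance(get_attr, tuple):
        return [getattr(results, name) for name in get_attr]
    return getattr(results, get_attr)

print(summary(Results()))                                       # (1.23, 0.04)
print(summary(Results(), get_attr=('xsec', 'xerru', 'axsec')))  # [1.23, 0.04, 1.3]
print(summary(Results(), get_attr='axsec'))                     # 1.3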
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h index 7c6a082392..be5c5a6357 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 420090461f..daecfb0066 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.5 2025-10-17 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,45 +549,45 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.118 s +1 processes with 3 diagrams generated in 0.081 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.136 s +ALOHA: aloha creates 2 routines in 0.108 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/hack_ihel3_sep25_pr/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.291s -user 0m1.202s -sys 0m0.072s -Code generation completed in 2 seconds +real 0m1.145s +user 0m1.053s +sys 0m0.085s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/COPYRIGHT b/epochX/cudacpp/susy_gg_tt.sa/COPYRIGHT index e4a5daf207..d5f6746559 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/COPYRIGHT +++ b/epochX/cudacpp/susy_gg_tt.sa/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright (C) 2020-2024 CERN and UCLouvain. +Copyright (C) 2020-2025 CERN and UCLouvain. Licensed under the GNU Lesser General Public License (version 3 or later). All rights not expressly granted are reserved. @@ -13,6 +13,7 @@ initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on CPUs using vectorized C++ by three original authors from CERN and UCLouvain. The full development team currently includes the following authors : Stephan Hageboeck (CERN) + Daniele Massaro (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) Jorgen Teig (CERN) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h index 87aa648dd2..4e3f17e0dd 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,8 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -15,10 +16,9 @@ #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc //#ifdef __HIPCC__ -//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -//#else -//#include // bypass this completely to ease portability on LUMI #803 -//#endif +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif #include // bypass std::filesystem #803 @@ -38,9 +38,10 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- /** - * A base class for a class whose pointer is passed between Fortran and C++. - * This is not really necessary, but it allows minimal type checks on all such pointers. - */ + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ struct CppObjectInFortran { CppObjectInFortran() {} @@ -49,39 +50,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /** - * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. - * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). - * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. - * - * The Fortran momenta passed in are in the form of - * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) - * where the dimensions are , , . - * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. - * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. - * The Bridge is configured to store nevt==nevtF events in CUDA/C++. - * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. - * - * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. - * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. - * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. - * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). - */ + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). 
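To make the two layouts described in the comment above concrete, the standalone snippet below (illustrative sizes only, not code from this patch) computes where one momentum component lives in the Fortran-style AOS array p_multi[nevt][npar][np4] and in the cudacpp AOSOA array momenta[npagM][npar][np4][neppM], with nevt = npagM*neppM.

#include <cassert>
#include <cstddef>
int main()
{
  const std::size_t npar = 4, np4 = 4, neppM = 8, nevt = 32; // illustrative sizes only
  assert( nevt % neppM == 0 );
  const std::size_t ievt = 13, ipar = 2, ip4 = 1; // one arbitrary momentum component
  // Fortran AOS: C reads P_MULTI(0:3,NEXTERNAL,VECSIZE_USED) as p_multi[nevt][npar][np4]
  const std::size_t fpos = ievt * npar * np4 + ipar * np4 + ip4;
  // cudacpp AOSOA: momenta[npagM][npar][np4][neppM]
  const std::size_t ipagM = ievt / neppM, ieppM = ievt % neppM;
  const std::size_t cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
  assert( fpos == 217 ); // 13*16 + 2*4 + 1
  assert( cpos == 205 ); // 1*128 + 2*32 + 1*8 + 5
  return 0;
}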
+ */ template class Bridge final : public CppObjectInFortran { public: /** - * Constructor - * - * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) - * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) - */ + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); /** - * Destructor - */ + * Destructor + */ virtual ~Bridge() {} // Delete copy/move constructors and assignment operators @@ -92,74 +100,70 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL /** - * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads - * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) - * - * @param gpublocks number of gpublocks - * @param gputhreads number of gputhreads - */ + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ void set_gpugrid( const int gpublocks, const int gputhreads ); /** - * Sequence to be executed for the Cuda matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void gpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #else /** - * Sequence to be executed for the vectorized CPU matrix element calculation - * - * @param momenta the pointer to the input 4-momenta - * @param gs the pointer to the input Gs (running QCD coupling constant alphas) - * @param rndhel the pointer to the input random numbers for helicity selection - * @param rndcol the pointer to the input random numbers for color selection - * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 to n - * @param mes the pointer to the output matrix elements - * @param selhel the pointer to the output selected helicities - * @param selcol the pointer to the output selected colors - * @param goodHelOnly quit after computing good helicities? - */ - void cpu_sequence( const FORTRANFPTYPE* momenta, - const FORTRANFPTYPE* gs, - const FORTRANFPTYPE* rndhel, - const FORTRANFPTYPE* rndcol, - const unsigned int* channelIds, - FORTRANFPTYPE* mes, - int* selhel, - int* selcol, - const bool goodHelOnly = false ); + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); #endif - // Return the number of good helicities (-1 initially when they have not yet been calculated) + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) int nGoodHel() const { return m_nGoodHel; } - // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events - int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) #ifdef MGONGPUCPP_GPUIMPL - int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) - int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) DeviceBuffer m_devMomentaF; DeviceBufferMomenta m_devMomentaC; DeviceBufferGs m_devGs; @@ -177,8 +181,10 @@ namespace mg5amcCpu PinnedHostBufferSelectedColor m_hstSelCol; PinnedHostBufferChannelIds m_hstChannelIds; std::unique_ptr m_pmek; - //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) - static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) #else HostBufferMomenta m_hstMomentaC; HostBufferGs m_hstGs; @@ -217,8 +223,7 @@ namespace mg5amcCpu template Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) - : m_nevt( nevtF ) - , m_nGoodHel( -1 ) + : m_nevt( nevtF ), m_nGoodHel( -1 ) #ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads @@ -243,55 +248,90 @@ namespace mg5amcCpu , m_hstChannelIds( m_nevt ) , m_pmek( nullptr ) { - if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw std::runtime_error( "Bridge constructor: np4 mismatch" ); #ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) - throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) { m_gputhreads /= 2; if( m_gputhreads < s_gputhreadsmin ) - throw std::logic_error( "Bridge constructor: FIXME! 
cannot choose gputhreads" ); // this should never happen! + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! m_gpublocks = m_nevt / m_gputhreads; } - std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else - std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
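The grid-sizing logic above (halve gputhreads from the default of 256 until gpublocks*gputhreads equals nevt, never going below the minimum of 32) can be exercised in isolation; the following is a sketch with an arbitrary nevt, not the plugin code itself.

#include <cstdio>
#include <stdexcept>
#include <string>
int main()
{
  const int nevt = 8320;          // arbitrary example, a multiple of s_gputhreadsmin
  const int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT)
  if( ( nevt < s_gputhreadsmin ) || ( nevt % s_gputhreadsmin != 0 ) )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
  int gputhreads = 256;              // default number of gpu threads
  int gpublocks = nevt / gputhreads; // initial guess for the number of gpu blocks
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2;
    if( gputhreads < s_gputhreadsmin )
      throw std::logic_error( "cannot choose gputhreads" ); // should never happen
    gpublocks = nevt / gputhreads;
  }
  std::printf( "nevt=%d gpublocks=%d gputhreads=%d\n", nevt, gpublocks, gputhreads ); // 8320, 65, 128
  return 0;
}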
CPPProcess process( /*verbose=*/false ); - std::string paramCard = "../../Cards/param_card.dat"; + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down /* #ifdef __HIPCC__ - if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#else - if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; -#endif - */ - //struct stat dummybuffer; // bypass std::filesystem #803 - //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // auto fileExists = []( std::string& fileName ) - { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; - if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL template - void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) { if( m_nevt != gpublocks * gputhreads ) - throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; - std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads - << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -316,8 +356,12 @@ namespace mg5amcCpu else { gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); - const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) - //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) @@ -333,8 +377,11 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... 
// no need to initialize m_hstChannel: it is allocated with gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT used later on + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -342,12 +389,16 @@ namespace mg5amcCpu if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -391,16 +442,22 @@ namespace mg5amcCpu std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); } const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); - if( useChannelIds ) memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); - //else ... // no need to initialize m_hstChannel: it is allocated and default initialized in HostBufferBase (and it is not used later on anyway) + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); - if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); @@ -419,7 +476,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // // Implementations of transposition methods - // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // @@ -444,30 +502,31 @@ namespace mg5amcCpu int rest_2 = rest_1 % ( strd * mome ); int mome_i = rest_2 / strd; int strd_i = rest_2 % strd; - int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - out[pos] = in[inpos]; // F2C (Fortran to C) + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) } } else { - // AV attempt 
another implementation with 1 event per thread: this seems slower... - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM constexpr int npar = CPPProcess::npar; constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? int ievt = blockDim.x * blockIdx.x + threadIdx.x; int ipagM = ievt / neppM; int ieppM = ievt % neppM; for( int ip4 = 0; ip4 < np4; ip4++ ) for( int ipar = 0; ipar < npar; ipar++ ) { - int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; int fpos = ievt * npar * np4 + ipar * np4 + ip4; out[cpos] = in[fpos]; // F2C (Fortran to C) } @@ -494,23 +553,23 @@ namespace mg5amcCpu unsigned int rest_2 = rest_1 % ( strd * mome ); unsigned int mome_i = rest_2 / strd; unsigned int strd_i = rest_2 % strd; - unsigned int inpos = - ( page_i * strd + strd_i ) // event number - * ( part * mome ) // event size (pos of event) - + part_i * mome // particle inside event - + mome_i; // momentum inside particle - if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) - out[pos] = in[inpos]; // F2C (Fortran to C) + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) else out[inpos] = in[pos]; // C2F (C to Fortran) } } else { - // AV attempt another implementation: this is slightly faster (better c++ pipelining?) - // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] - // F-style: AOS[nevtF][nparF][np4F] - // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM constexpr unsigned int npar = CPPProcess::npar; constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; @@ -521,14 +580,16 @@ namespace mg5amcCpu else { const unsigned int npagM = nevt / neppM; - assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
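As the comments above note, the F2C step is really an AOS-to-AOSOA conversion rather than a plain transposition, and for neppM=1 a memcpy suffices. A standalone host-side sketch of that conversion, with a hypothetical helper name aosToAosoa and arbitrary sizes, could look as follows.

#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>
template<typename T>
void aosToAosoa( T* out, const T* in, std::size_t nevt, std::size_t npar, std::size_t np4, std::size_t neppM )
{
  assert( nevt % neppM == 0 ); // number of events must be a multiple of neppM
  if( neppM == 1 )
  {
    std::memcpy( out, in, nevt * npar * np4 * sizeof( T ) ); // AOSOA with neppM=1 is identical to AOS
    return;
  }
  const std::size_t npagM = nevt / neppM;
  for( std::size_t ipagM = 0; ipagM < npagM; ipagM++ )
    for( std::size_t ip4 = 0; ip4 < np4; ip4++ )
      for( std::size_t ipar = 0; ipar < npar; ipar++ )
        for( std::size_t ieppM = 0; ieppM < neppM; ieppM++ )
        {
          const std::size_t ievt = ipagM * neppM + ieppM;
          const std::size_t cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
          const std::size_t fpos = ievt * npar * np4 + ipar * np4 + ip4;
          out[cpos] = in[fpos]; // F2C (Fortran AOS to C++ AOSOA)
        }
}
int main()
{
  const std::size_t nevt = 16, npar = 4, np4 = 4, neppM = 4; // illustrative sizes only
  std::vector<double> in( nevt * npar * np4 ), out( in.size() );
  for( std::size_t i = 0; i < in.size(); i++ ) in[i] = double( i );
  aosToAosoa( out.data(), in.data(), nevt, npar, np4, neppM );
  return 0;
}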
for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) for( unsigned int ipar = 0; ipar < npar; ipar++ ) for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) { unsigned int ievt = ipagM * neppM + ieppM; - unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; if constexpr( F2C ) out[cpos] = in[fpos]; // F2C (Fortran to C) @@ -554,5 +615,5 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- -} +} // namespace mg5amcGpu #endif // BRIDGE_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..5ede45b123 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
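The gpuBlasT* aliases introduced in GpuAbstraction.h above select the single- or double-precision BLAS entry points according to MGONGPU_FPTYPE2_FLOAT, so the color-sum code can call one name regardless of precision. The stand-in below only illustrates that dispatch pattern; myBlasSgemm and myBlasDgemm are hypothetical placeholders, not cuBLAS/hipBLAS calls.

#include <cstdio>
// Hypothetical stand-ins for the precision-specific BLAS entry points
void myBlasSgemm() { std::puts( "single-precision gemm (fptype2 = float)" ); }
void myBlasDgemm() { std::puts( "double-precision gemm (fptype2 = double)" ); }
#ifdef MGONGPU_FPTYPE2_FLOAT
typedef float fptype2;
#define myBlasTgemm myBlasSgemm
#else
typedef double fptype2;
#define myBlasTgemm myBlasDgemm
#endif
int main()
{
  std::printf( "sizeof(fptype2) = %zu\n", sizeof( fptype2 ) );
  myBlasTgemm(); // resolved at preprocessing time, like gpuBlasTgemm above
  return 0;
}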
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,27 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +352,80 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +442,55 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... 
Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +498,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +518,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..16f8874888 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
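The CUDACPP_RUNTIME_BLASCOLORSUM handling added to MatrixElementKernelDevice above enables BLAS color sums only when the variable is set and non-empty (and throws if BLAS was disabled at build time). A minimal standalone sketch of that getenv-based toggle, without the cuBLAS/hipBLAS parts, is shown here.

#include <cstdlib>
#include <iostream>
#include <string>
int main()
{
  bool blasColorSum = false; // same default as m_blasColorSum
  const char* blasEnv = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  if( blasEnv && std::string( blasEnv ) != "" )
  {
    blasColorSum = true;
    std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl;
  }
  else
  {
    std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl;
  }
  return blasColorSum ? 0 : 1;
}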
- static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,21 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +217,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h index 5bd3053393..c5e79dc1b1 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ 
-194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + 
typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index 6867c6d67d..81057d8134 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,9 +98,10 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) @@ -106,10 +109,7 @@ namespace mg5amcCpu using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,43 +168,99 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE 
void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 + __global__ void /* clang-format off */ + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] +#ifdef MGONGPUCPP_GPUIMPL + fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel + fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) #endif - ) + const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = 
DeviceAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -216,7 +272,6 @@ namespace mg5amcCpu #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events @@ -225,14 +280,17 @@ namespace mg5amcCpu using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif -#endif /* clang-format on */ +#endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; #ifndef MGONGPUCPP_GPUIMPL //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); + //if( debug ) printf( "calculate_jamps: ievt00=%d ihel=%2d\n", ievt00, ihel ); +#else + //const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + //debug = ( ievt == 0 ); + //if( debug ) printf( "calculate_jamps: ievt=%6d ihel=%2d\n", ievt, ihel ); +#endif /* clang-format on */ // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... 
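Note on the access pattern used throughout these kernels: DeviceAccessJamp2::kernelAccessIcol above, like the other *_ACCESS helpers, implements a structure-of-arrays layout with one GPU thread per event, where element (icol, ievt) lives at buffer[icol * nevt + ievt] so that consecutive threads in a warp touch consecutive addresses (coalesced access). A minimal illustrative kernel using the same convention (hypothetical kernel and buffer names, not part of this patch):

  // Illustrative sketch only: same icol-major / ievt-fastest layout as DeviceAccessJamp2.
  __global__ void scaleAllColors( double* buffer, const double factor, const int ncolor )
  {
    const int nevt = gridDim.x * blockDim.x;                // total number of events in the grid
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
    for( int icol = 0; icol < ncolor; icol++ )
      buffer[icol * nevt + ievt] *= factor;                 // stride of nevt between colors
  }
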
@@ -258,14 +316,10 @@ namespace mg5amcCpu // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // START LOOP ON IPARITY for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif @@ -289,7 +343,6 @@ namespace mg5amcCpu const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = allNumerators; fptype* denominators = allDenominators; @@ -303,7 +356,6 @@ namespace mg5amcCpu //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); @@ -314,6 +366,10 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); +#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); @@ -365,154 +421,43 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr + { for( int icol = 0; icol < ncolor; icol++ ) jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?) 
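The atomicAdd just above is required because, with one CUDA/HIP stream per good helicity, several calculate_jamps launches can be in flight at the same time and all accumulate |jamp|^2 into the same colAllJamp2s slot for a given (icol, ievt); a plain += would be a data race. A minimal sketch of the idea, with a hypothetical kernel and a simplified split Re/Im layout (double-precision atomicAdd needs compute capability 6.0 or newer):

  // Illustrative sketch only: kernels launched in different helicity streams
  // accumulate into one shared per-color, per-event running sum without racing.
  __global__ void accumulateJamp2( double* colJamp2s,    // [ncolor * nevt], shared by all helicity streams
                                   const double* jampRe, // [ncolor * nevt], this helicity only
                                   const double* jampIm, // [ncolor * nevt], this helicity only
                                   const int ncolor )
  {
    const int nevt = gridDim.x * blockDim.x;
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    for( int icol = 0; icol < ncolor; icol++ )
    {
      const int idx = icol * nevt + ievt;
      const double abs2 = jampRe[idx] * jampRe[idx] + jampIm[idx] * jampIm[idx];
      atomicAdd( &colJamp2s[idx], abs2 ); // concurrent kernels may update the same slot
    }
  }
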
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } - fptype_sv deltaMEs_previous = { 0 }; +#endif /* clang-format on */ #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
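To make the removed comments above concrete: for this process ncolor=2, cf={{16,-2},{-2,16}} and denom={3,3}, so the color sum reduces to |M|^2 += ( 16*|jamp0|^2 + 16*|jamp1|^2 - 4*Re(jamp0*conj(jamp1)) ) / 3; because cf is real, the quadratic form over complex jamps splits into one quadratic form over the real parts plus one over the imaginary parts (the "AMA + BMB" identity quoted above). A standalone arithmetic check of this, using std::complex instead of the plugin's cxtype_sv (illustrative only, not plugin code):

  #include <complex>
  #include <cstdio>

  int main() // standalone check of the ncolor=2 color sum
  {
    constexpr int ncolor = 2;
    constexpr double denom[ncolor] = { 3, 3 };
    constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
    const std::complex<double> jamp[ncolor] = { { 1.5, -0.5 }, { -0.25, 2.0 } };
    // Full quadratic form, as in the removed CUDA loop: sum_i ( sum_j cf[i][j]*jamp_j ) . jamp_i / denom[i]
    double me = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      double ztempR = 0, ztempI = 0;
      for( int j = 0; j < ncolor; j++ )
      {
        ztempR += cf[i][j] * jamp[j].real();
        ztempI += cf[i][j] * jamp[j].imag();
      }
      me += ( ztempR * jamp[i].real() + ztempI * jamp[i].imag() ) / denom[i];
    }
    // Closed form for this 2x2 case
    const double me2 = ( 16 * std::norm( jamp[0] ) + 16 * std::norm( jamp[1] ) - 4 * ( jamp[0] * std::conj( jamp[1] ) ).real() ) / 3;
    printf( "quadratic form: %f closed form: %f\n", me, me2 ); // both print 36.833333
    return 0;
  }
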
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; #endif - } - - // *** STORE THE RESULTS *** + } + // END LOOP ON IPARITY - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( 
cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY mgDebug( 1, __FUNCTION__ ); return; } @@ -552,7 +497,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -585,6 +534,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -625,6 +578,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -745,8 +702,8 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -754,25 +711,41 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event 
random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ + void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb 
are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { @@ -917,13 +1052,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -935,18 +1064,23 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else + // *** PART 0b - C++ *** const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) { @@ -971,93 +1105,30 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - if( 
channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } - } - else - { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) - } + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1099,7 +1170,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1122,7 +1193,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! 
in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1131,21 +1202,23 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); @@ -1159,8 +1232,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1176,11 +1251,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1282,14 +1358,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h index 24c27005b8..f74d539775 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include @@ -75,6 +76,7 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] @@ -122,7 +124,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -130,9 +132,11 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -152,34 +156,45 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, 
running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b68b9250fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc @@ -0,0 +1,427 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
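For orientation, the temporary "ghel" super-buffers passed to the new GPU sigmaKin above are all sized per good helicity and per event; a minimal sketch of the element counts implied by the signature comments is given below (illustrative only, not part of this patch; the helper names and the example numbers are hypothetical).

#include <cstddef>

// Element counts implied by the buffer comments in the sigmaKin signature above:
// ghelAllJamps is an allJamps super-buffer[2][ncolor][nGoodHel][nevt] of fptype values
// (factor 2 for real and imaginary parts), while ghelAllMEs holds one running |M|^2
// sum per good helicity and per event.
constexpr std::size_t ghelAllJampsElements( std::size_t ncolor, std::size_t nGoodHel, std::size_t nevt )
{
  return 2 * ncolor * nGoodHel * nevt;
}

constexpr std::size_t ghelAllMEsElements( std::size_t nGoodHel, std::size_t nevt )
{
  return nGoodHel * nevt;
}

// Example: this susy gg->ttx process has ncolor=2 and ncomb=16, so with (at most) 16 good
// helicities and e.g. 524288 events the jamp super-buffer holds 2*2*16*524288 values.
static_assert( ghelAllJampsElements( 2, 16, 524288 ) == 2ull * 2 * 16 * 524288, "unexpected size" );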
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
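The rewrite of the quadratic form described in the comment above can be checked in isolation. The standalone snippet below is an illustrative sketch only (not part of this patch): it uses the ncolor=2 color matrix and denominators hardcoded in this file, plus arbitrary test amplitudes, to verify that the triangular real form with doubled off-diagonal terms reproduces the full complex form jamp^dagger (M/denom) jamp.

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 2;
  const double colorDenom[ncolor] = { 3, 3 };
  const double colorMatrix[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  const std::complex<double> jamp[ncolor] = { { 1.2, -0.7 }, { -0.3, 2.1 } }; // arbitrary test values
  // Full complex quadratic form: sum_ij conj(jamp_i) * M_ij / d_i * jamp_j
  double me1 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ ) ztemp += colorMatrix[icol][jcol] * jamp[jcol];
    me1 += ( std::conj( jamp[icol] ) * ztemp ).real() / colorDenom[icol];
  }
  // Triangular real form (as in the loop below): AMA + BMB with off-diagonal terms counted twice
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = colorMatrix[icol][icol] / colorDenom[icol] * jamp[icol].real();
    double ztempI = colorMatrix[icol][icol] / colorDenom[icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < ncolor; jcol++ )
    {
      ztempR += 2 * colorMatrix[icol][jcol] / colorDenom[icol] * jamp[jcol].real();
      ztempI += 2 * colorMatrix[icol][jcol] / colorDenom[icol] * jamp[jcol].imag();
    }
    me2 += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI;
  }
  assert( std::abs( me1 - me2 ) < 1e-9 );
  return 0;
}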
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 
}; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // 
start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum for all good helicities to BLAS + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/cudacpp_overlay.mk new file mode 120000 index 0000000000..181212c4c6 --- /dev/null +++ 
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/cudacpp_overlay.mk @@ -0,0 +1 @@ +../cudacpp_overlay.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.h new file mode 120000 index 0000000000..067632d2b4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.h @@ -0,0 +1 @@ +../fbridge.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/makefile_original.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/makefile_original.mk new file mode 120000 index 0000000000..953b628165 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/makefile_original.mk @@ -0,0 +1 @@ +../makefile_original.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return 
cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
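Going back to the BLAS color sum above: what the gpuBlasTgemm and gpuBlasTgemmStridedBatched calls in color_sum_blas compute, for the real and imaginary parts separately, can be written as a plain C++ reference using the [2][ncolor][nhel][nevt] jamp striding described in DeviceAccessJamp. This is an illustrative sketch only (not part of this patch); colorSumReference and its arguments are hypothetical names, and the normalized color matrix is assumed to be symmetric and already divided by the denominators.

#include <vector>

void colorSumReference( std::vector<double>& ghelAllMEs,          // [nhel*nevt], |M|^2 accumulated in place
                        const std::vector<double>& ghelAllJamps,  // [2][ncolor][nhel][nevt], real block then imaginary block
                        const std::vector<double>& normColorMat,  // [ncolor][ncolor], symmetric, already divided by denom
                        int ncolor, int nhel, int nevt )
{
  const int n = nhel * nevt; // one column per (helicity, event) pair
  const double* jampR = ghelAllJamps.data();              // real parts
  const double* jampI = ghelAllJamps.data() + ncolor * n; // imaginary parts
  std::vector<double> ztempR( ncolor * n ), ztempI( ncolor * n );
  // Step 1 (the two gemm calls): Ztemp[icol][ix] = sum_jcol NormColorMat[icol][jcol] * Jamp[jcol][ix]
  for( int icol = 0; icol < ncolor; icol++ )
    for( int ix = 0; ix < n; ix++ )
    {
      double sumR = 0, sumI = 0;
      for( int jcol = 0; jcol < ncolor; jcol++ )
      {
        sumR += normColorMat[icol * ncolor + jcol] * jampR[jcol * n + ix];
        sumI += normColorMat[icol * ncolor + jcol] * jampI[jcol * n + ix];
      }
      ztempR[icol * n + ix] = sumR;
      ztempI[icol * n + ix] = sumI;
    }
  // Step 2 (the two strided-batched gemm calls): per-(helicity,event) dot products,
  // added on top of the existing per-helicity |M|^2 running sums (beta=1)
  for( int ix = 0; ix < n; ix++ )
  {
    double me = 0;
    for( int icol = 0; icol < ncolor; icol++ )
      me += jampR[icol * n + ix] * ztempR[icol * n + ix] + jampI[icol * n + ix] * ztempI[icol * n + ix];
    ghelAllMEs[ix] += me;
  }
}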
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. 
Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) +endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. +LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LIBKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) +$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# 
Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! 
Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h new file mode 100644 index 0000000000..7d5014a138 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.h @@ -0,0 +1,49 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin. 
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/makefile_wrapper.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc index 4eec5db13c..678eb8c34e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h index 9ed58e24f1..f5c68fb7c4 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc index aa00d6a9e4..0fd9310ffa 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 3e29f2ccbe..5a7f431dc1 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.5, 2025-10-17 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h index d3c4ca5695..7d34de72f8 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing color sums +// For both CUDA and HIP, by default, assume that cuBLAS/hipBLAS are available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk index f703a1ae7c..48b2037dc2 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk @@ -1,10 +1,20 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + # Compiler-specific googletest build directory (#125 and #738) # In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk # In epoch1/epoch2, CXXNAMESUFFIX is undefined @@ -19,11 +29,11 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) - cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile $(MAKE) -C googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/tmad/allTees.sh b/epochX/cudacpp/tmad/allTees.sh index eb39e2b302..17367f7f6b 100755 --- a/epochX/cudacpp/tmad/allTees.sh +++ b/epochX/cudacpp/tmad/allTees.sh @@ -1,23 +1,41 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (May 2022) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. scrdir=$(cd $(dirname $0); pwd) host=$(hostname) if [ "${host/juwels}" != "${host}" ]; then ${scrdir}/juwelspatch.sh; fi # workaround for #498 +# Usage +function usage() +{ + echo "Usage (1): $0 [-short|-ggttggg] [-bsmonly|-nobsm] [-makeclean] [+10x] [-hip]" + echo "Run tests and check all logs" + echo "" + echo "Usage (2): $0 -checkonly" + echo "Check existing logs without running any tests" + exit 1 +} + +# Parse command line arguments +checkonly=0 short=0 bsm= flts=-dmf # "d m f" (alternative: -d_f i.e. "d f") makeclean= rmrdat= -add10x="+10x" +add10x= hip= - -while [ "$1" != "" ]; do +if [ "$1" == "-checkonly" ]; then + # Check existing logs without running any tests? 
+ checkonly=1 + shift + if [ "$1" != "" ]; then usage; fi +fi +while [ "${checkonly}" == "0" ] && [ "$1" != "" ]; do if [ "$1" == "-short" ]; then short=1 # all (possibly including bsm) but ggttggg shift @@ -27,8 +45,8 @@ while [ "$1" != "" ]; do elif [ "$1" == "-makeclean" ]; then makeclean=$1 shift - elif [ "$1" == "-no10x" ]; then - add10x="" + elif [ "$1" == "+10x" ]; then + add10x=$1 shift elif [ "$1" == "-bsmonly" ] && [ "$bsm" != "-nobsm" ]; then bsm=$1 @@ -40,43 +58,73 @@ while [ "$1" != "" ]; do hip=$1 shift else - echo "Usage: $0 [-short|-ggttggg] [-bsmonly|-nobsm] [-makeclean] [-no10x] [-hip]" - exit 1 + usage fi done -started="STARTED AT $(date)" - -if [ "${bsm}" != "-bsmonly" ]; then - if [ "$short" == "1" ]; then - ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq $flts $makeclean $rmrdat $add10x $hip - elif [ "$short" == "-1" ]; then - ${scrdir}/teeMadX.sh -ggttggg $flts $makeclean $rmrdat $add10x $hip - else - ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg $flts $makeclean $rmrdat $add10x $hip +# Run all tests +if [ "${checkonly}" == "0" ]; then + started="STARTED AT $(date)" + # SM tests + if [ "${bsm}" != "-bsmonly" ]; then + if [ "$short" == "1" ]; then + ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq $flts $makeclean $rmrdat $add10x $hip + elif [ "$short" == "-1" ]; then + ${scrdir}/teeMadX.sh -ggttggg $flts $makeclean $rmrdat $add10x $hip + else + ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg $flts $makeclean $rmrdat $add10x $hip + fi fi -fi -status=$? -ended1="(SM tests)\nENDED(1) AT $(date) [Status=$status]" - -if [ "${bsm}" != "-nobsm" ]; then - if [ "$short" != "-1" ]; then - ${scrdir}/teeMadX.sh -heftggbb -susyggtt -susyggt1t1 -smeftggtttt $flts $makeclean $rmrdat $add10x $hip + status=$? + ended1="(SM tests)\nENDED(1) AT $(date) [Status=$status]" + # BSM tests + if [ "${bsm}" != "-nobsm" ]; then + if [ "$short" != "-1" ]; then + ${scrdir}/teeMadX.sh -heftggbb -susyggtt -susyggt1t1 -smeftggtttt $flts $makeclean $rmrdat $add10x $hip + fi fi + status=$? + ended2="(BSM tests)\nENDED(1) AT $(date) [Status=$status]" + # Timing information + echo + printf "\n%80s\n" |tr " " "#" + echo + echo -e "$started" + echo -e "$ended1" + echo -e "$ended2" + echo fi -status=$? 
-ended2="(BSM tests)\nENDED(1) AT $(date) [Status=$status]" # Print out the number of "OK!"s in each log (expect 24) +for f in ${scrdir}/logs_*_mad/log_*; do echo $(cat $f | grep OK | wc -l) $f; done # expect 24 + +# Print out any errors or aborts in the logs echo -printf "\n%80s\n" |tr " " "#" +txt=$(egrep -i '(error|abort)' tmad/logs* -r | sed 's/:0:rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/') +if [ "${txt}" == "" ]; then + echo "No errors or aborts found in logs" +else + echo "${txt}" +fi + +# Print out any asserts in the logs echo -echo -e "$started" -echo -e "$ended1" -echo -e "$ended2" +txt=$(grep assert tmad/logs* -r | sed "s/Gpu.*Assert/Assert/") +if [ "${txt}" == "" ]; then + echo "No asserts found in logs" +else + echo "${txt}" +fi + +# Print out any segfaults in the logs echo -for f in ${scrdir}/logs_*_mad/log_*; do echo $(cat $f | grep OK | wc -l) $f; done # expect 24 - +txt=$(grep -i segmentation tmad/logs* -r | sed "s/Gpu.*Assert/Assert/") +if [ "${txt}" == "" ]; then + echo "No segmentation fault found in logs" +else + echo "${txt}" +fi + # Print out the MEK channelid debugging output echo \grep MEK ${scrdir}/logs_*/* | sed "s|${scrdir}/logs_||" | sed 's|_mad.*DEBUG:||' | sort -u diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index c9c9460105..9875c9cf7a 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: 
Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:14 +DATE: 2025-10-11_17:08:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7444s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7368s - [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7544s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7467s + [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.07E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2176s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2101s - [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2221s + [COUNTERS] Fortran 
Overhead ( 0 ) : 0.2144s + [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2197s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2222s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2147s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0072s for 8192 events => throughput is 1.14E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173944E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.158620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149454e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.163690e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.182730e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2173s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.81E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2208s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2160s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.82E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173944E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.887925e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914270e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.991506e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.995666e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2160s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2124s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2170s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2130s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 8192 events => throughput is 2.23E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.590914e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.533255e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.667984e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.641624e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2167s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.46E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2163s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2127s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.41E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.636316e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.651338e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.730901e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.725193e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2188s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2145s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 2.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2180s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.98E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.085135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.065060e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.218811e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.156200e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,10 +341,10 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6526s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6492s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.88E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.6520s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6479s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0034s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -348,44 +355,44 @@ OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789448173971E-00 OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.299210e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.427727e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.632885e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.442402e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.507229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.123576e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.868548e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.069823e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.543060e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.084747e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.911449e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.494944e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.533062e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.063740e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.164979e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.415941e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 13ceac3a87..fbf3c34fcc 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 - +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:39 +DATE: 2025-10-11_17:08:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7443s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7370s - [COUNTERS] Fortran MEs ( 1 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7580s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7502s + [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2183s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2108s - [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2217s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2138s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432777382586498E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2266s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2197s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.21E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2214s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2142s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.18E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432777382586498E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.221258e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.197154e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.225429e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.200720e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432774839452045E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2220s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2190s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2161s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2132s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 2.99E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774839452045E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.137547e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.577999e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.221144e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.183473e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2228s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2200s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.13E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2183s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2155s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.17E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774915924193E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.328121e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.468253e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.556846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.468239e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2241s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2199s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2171s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.19E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774915924193E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.452418e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.276853e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.604389e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.494548e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432778556608516E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2173s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2144s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.08E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2182s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.90E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432778556608516E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.402847e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.354967e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.641263e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.469737e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432780016531851E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432779972212775E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6500s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6467s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.92E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.6719s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6677s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.25E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432780016531851E-002) differ by less than 4E-4 (1.0203783951112655e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432779972212775E-002) differ by less than 4E-4 (1.0251731308308365e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.451436e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.421145e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.688055e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.263812e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.014252e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.466407e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.229387e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.768150e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.787718e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.574848e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.220221e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.510215e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.380548e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.891814e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.826286e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.714240e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 093bec81e5..07ac440ea1 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:27 +DATE: 2025-10-11_17:08:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7605s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7527s - [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7547s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7469s + [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2144s - [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2206s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2128s + [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2212s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2248s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2169s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444986618E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.133245e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.138160e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.115304e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.141490e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2168s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2174s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2129s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.90E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444986618E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.993139e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.989196e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.058944e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.027429e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2174s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2138s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.42E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2195s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2156s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.30E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.549665e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.540266e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.708708e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.722635e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2179s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.49E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.26E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.606715e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.634053e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.748967e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.703762e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2165s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2186s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.06E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.203720e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.160546e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.284212e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.303805e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789437826970E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432789453073233E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6505s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6470s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.77E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.6515s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6475s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0033s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789437826970E-002) differ by less than 2E-4 (1.1194101201539297e-10) +OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789453073233E-002) differ by less than 2E-4 (5.3003379463234523e-11) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.269035e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.593291e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.550305e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.163347e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.523745e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.056075e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.857337e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.054571e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.551254e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.089599e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.897534e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.480305e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.503798e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.035852e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.184430e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.419141e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 794f102690..9182ca8a9b 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
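
Note for readers of these tput/tmad logs: the `-p <blocks> <threads> <iterations>` arguments of the CHECK/GCHECK commands are a GPU-style launch configuration, and the number quoted in the label (8192, MAX, MAX128THR, MAX8THR) appears to correspond to the implied events per iteration, i.e. blocks times threads. A minimal sketch of that arithmetic follows; the helper name is ours and purely illustrative, not code from this repository.

```python
# Illustrative only: relate the "-p <blocks> <threads> <iterations>" arguments
# of CHECK/GCHECK to the event count quoted in the label (blocks x threads).
def events_per_iteration(blocks: int, threads: int) -> int:
    return blocks * threads

assert events_per_iteration(256, 32) == 8192      # CHECK(8192) / GCHECK(8192)
assert events_per_iteration(16384, 32) == 524288  # GCHECK(MAX)
assert events_per_iteration(4096, 128) == 524288  # GCHECK(MAX128THR)
assert events_per_iteration(65536, 8) == 524288   # GCHECK(MAX8THR)
```
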
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:53 +DATE: 2025-10-11_17:09:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8494s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8073s - [COUNTERS] Fortran MEs ( 1 ) : 0.0421s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8533s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8106s + [COUNTERS] Fortran MEs ( 1 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4510s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s - [COUNTERS] Fortran MEs ( 1 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4516s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4087s + [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.91E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4555s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0453s for 8192 events => throughput is 1.81E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4606s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4148s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0454s for 8192 events => throughput is 1.80E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.856020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.822539e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.865986e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.841641e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4352s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4103s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 8192 events => throughput is 3.34E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4390s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4130s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0257s for 8192 events => throughput is 3.19E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.314758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.221117e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.321531e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.252405e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4235s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4077s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.30E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4339s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4171s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0164s for 8192 events => throughput is 4.99E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.263509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.116784e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.327379e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.216981e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4237s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4087s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.58E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4313s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4153s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.24E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.648502e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.229787e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.831851e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.438042e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4297s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4071s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0222s for 8192 events => throughput is 3.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4415s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4172s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 8192 events => throughput is 3.42E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (47.138611968034162) and cpp (47.138611968034169) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.526689e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.514185e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.574003e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.539500e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8534s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8496s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.68E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8618s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8570s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0040s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cuda (47.138611968034176) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.138611968034176) and cuda (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.103830e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.853419e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.448285e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.409968e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.875229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.832304e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.627647e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.660331e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.886865e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.861253e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.006782e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.014024e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.862106e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.853068e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.715892e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.417253e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 2bf2a37cc7..7fd8a9128c 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 + +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
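
As a reading aid for the [COUNTERS] lines in these logs: the "=> throughput is ..." figures are consistent with a simple events-per-elapsed-second ratio. The sketch below is illustrative only (the helper name is ours, not repository code), and because the printed times are rounded it only reproduces the quoted throughputs approximately.

```python
# Illustrative only: recompute a [COUNTERS] throughput as events / seconds.
def throughput(n_events: int, seconds: float) -> float:
    return n_events / seconds

print(f"{throughput(8192, 0.0430):.2E}")  # ~1.91E+05, cf. a Fortran MEs counter above
print(f"{throughput(8192, 0.0007):.2E}")  # ~1.17E+07, cf. a CUDA MEs counter above (quoted 1.20E+07)
```
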
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:22 +DATE: 2025-10-11_17:09:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8450s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8027s - [COUNTERS] Fortran MEs ( 1 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8468s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8038s + [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.91E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4094s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4561s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4127s + [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138606099989779] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4548s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4118s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4596s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138606099989779) differ by less than 4E-4 (1.2448487851646206e-07) +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138606099989779) differ by less than 4E-4 (1.2448487873850667e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.973574e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.924656e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.981282e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925228e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602111070696] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4326s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4154s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0170s for 8192 events => throughput is 4.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4334s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4155s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.64E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138602111070696) differ by less than 4E-4 (2.091059336795098e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602111070696) differ by less than 4E-4 (2.091059339015544e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.659841e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.677131e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.743814e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.687091e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4174s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 8.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4249s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.65E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138602499179925) differ by less than 4E-4 (2.008725722424387e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602499179925) differ by less than 4E-4 (2.0087257257550561e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.079796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.918801e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.235810e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.134969e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4092s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.50E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4245s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 9.01E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (47.138611968034162) and cpp (47.138602499179925) differ by less than 4E-4 (2.008725722424387e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602499179925) differ by less than 4E-4 (2.0087257257550561e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.970038e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.308113e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.765544e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.304031e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138606840950104] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4258s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0124s for 8192 events => throughput is 6.60E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4294s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4163s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.41E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138606840950104) differ by less than 4E-4 (1.0876612277499476e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138606840950104) differ by less than 4E-4 (1.0876612310806166e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.636236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.713633e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.862568e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.787911e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138612402172164] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138612400084860] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8634s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.65E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.8642s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8595s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.07E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cuda (47.138612402172164) differ by less than 4E-4 (9.209817353195149e-09) +OK! xsec from fortran (47.138611968034176) and cuda (47.138612400084860) differ by less than 4E-4 (9.16553677399179e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.093880e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.299593e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.450343e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.634270e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.021092e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.759880e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.359313e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.744455e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.014796e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.777428e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.375647e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.990089e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.628808e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.374093e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.004427e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.364214e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 2ae843d323..e56bc4eee0 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
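
The "differ by less than <tolerance> (<delta>)" cross-section checks that recur throughout these logs are consistent with a relative difference computed as |1 - xsec_test/xsec_ref| and compared against a backend-dependent tolerance (e.g. 3E-14 for double, 4E-4 for float, 2E-4 for mixed precision). The sketch below is an inference from the printed values, not the actual tmad comparison script; the function name is ours.

```python
# Illustrative only: a relative-difference check consistent with the deltas
# printed in these logs (reference xsec from Fortran, test xsec from cpp/cuda).
def xsec_close(xsec_ref: float, xsec_test: float, tol: float) -> tuple[bool, float]:
    delta = abs(1.0 - xsec_test / xsec_ref)
    return delta < tol, delta

ok, delta = xsec_close(47.138611968034176, 47.138611968034169, 3e-14)
print(ok, delta)  # True 1.1102230246251565e-16, matching a double-precision 512z check above
```
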
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:08 +DATE: 2025-10-11_17:09:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8439s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8015s - [COUNTERS] Fortran MEs ( 1 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8528s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8099s + [COUNTERS] Fortran MEs ( 1 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4498s + [COUNTERS] PROGRAM TOTAL : 0.4512s [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s - [COUNTERS] Fortran MEs ( 1 ) : 0.0418s for 8192 events => throughput is 1.96E+05 events/s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613306947967] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4576s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0451s for 8192 events => throughput is 1.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4607s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4140s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 8192 events => throughput is 1.77E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613306947967) differ by less than 2E-4 (2.8403759566586473e-08) +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138613306947967) differ by less than 2E-4 (2.8403759344541868e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.815647e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.819635e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.845071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.820245e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613306947953] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4358s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4106s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4109s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.24E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613306947953) differ by less than 2E-4 (2.8403759344541868e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613306947953) differ by less than 2E-4 (2.8403759122497263e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.291111e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.279259e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.339005e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.279521e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4251s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4094s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4291s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.30E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593479165746e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.315398e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.322301e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.422217e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.904240e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4227s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0143s for 8192 events => throughput is 5.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4297s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.44E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (47.138611968034162) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593479165746e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.854463e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.558424e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.901611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.634376e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4322s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4099s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0219s for 8192 events => throughput is 3.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4402s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4164s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 8192 events => throughput is 3.49E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593479165746e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.724588e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.654630e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.694617e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.679375e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611963547788] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613294297848] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8506s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8468s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.66E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8631s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8584s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0041s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cuda (47.138611963547788) differ by less than 2E-4 (9.517409083059647e-11) +OK! xsec from fortran (47.138611968034176) and cuda (47.138613294297848) differ by less than 2E-4 (2.8135399343653944e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.987528e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912312e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.325954e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.471933e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.868584e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.863402e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.589038e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.634047e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.871326e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.849540e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.949192e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.953899e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.873573e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.847641e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.717025e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.416006e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 0c7ed732ed..d8d6f34ca2 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
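
Note on the "differ by less than ..." lines above: the Fortran and cudacpp cross sections are accepted when their relative difference falls below a threshold that depends on the floating-point mode (2E-4 here, 3E-14 and 4E-4 elsewhere in these logs). A minimal C++ sketch of such a check, assuming the printed number is |a-b| scaled by the reference value (the exact denominator used by the madgraph4gpu test scripts is not shown in these logs, but this reproduces the printed relative difference to its leading digits); the function name is hypothetical:

#include <cmath>
#include <cstdio>

// Hedged sketch of the "OK! xsec ... differ by less than TOL (reldif)" check seen above.
bool xsecCompatible( double xsecRef, double xsecNew, double tolerance )
{
  const double reldif = std::fabs( xsecNew - xsecRef ) / std::fabs( xsecRef );
  std::printf( "xsec ref=%.17g new=%.17g reldif=%.17g (tol=%g): %s\n",
               xsecRef, xsecNew, reldif, tolerance, reldif < tolerance ? "OK" : "ERROR" );
  return reldif < tolerance;
}

int main()
{
  // Numbers taken from the gg_ttx mixed-precision comparison above (reldif ~2.84e-08 < 2E-4)
  return xsecCompatible( 47.138611968034176, 47.138613306947953, 2e-4 ) ? 0 : 1;
}
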
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:36 +DATE: 2025-10-11_17:09:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7416s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4124s - [COUNTERS] Fortran MEs ( 1 ) : 0.3292s for 8192 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7558s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4158s + [COUNTERS] Fortran MEs ( 1 ) : 0.3400s for 8192 events => throughput is 2.41E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7177s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3873s - [COUNTERS] Fortran MEs ( 1 ) : 0.3304s for 8192 events => throughput is 2.48E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7272s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3869s + [COUNTERS] Fortran MEs ( 1 ) : 0.3403s for 8192 events => throughput is 2.41E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7353s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3872s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3470s for 8192 events => throughput is 2.36E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7509s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3914s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3585s for 8192 events => throughput is 2.29E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748553E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! 
xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748553E-002) differ by less than 3E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.455924e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.384792e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.454100e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.379994e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5656s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3857s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1793s for 8192 events => throughput is 4.57E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5787s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3912s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1868s for 8192 events => throughput is 4.39E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748567E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748567E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.669927e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.477039e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.620836e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.489628e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4792s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3884s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0903s for 8192 events => throughput is 9.07E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4876s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3928s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0942s for 8192 events => throughput is 8.69E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.331277e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.903439e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.327490e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.886830e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4693s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3876s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0812s for 8192 events => throughput is 1.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4804s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3924s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0874s for 8192 events => throughput is 9.37E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.048553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.779459e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.042752e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.857066e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748581E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5035s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3879s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1150s for 8192 events => throughput is 7.13E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5118s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1188s for 8192 events => throughput is 6.90E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748581E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.198283e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.951589e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.275587e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.994069e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,58 +341,58 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8395s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8270s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.61E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0030s + [COUNTERS] PROGRAM TOTAL : 0.8402s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8333s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.17E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0056s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cuda (7.8471485809748553E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471485809748553E-002) differ by less than 3E-14 (0.0) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111479e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.930684e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.523607e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.049354e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.454522e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.010359e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.167720e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.220373e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.412863e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.008910e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.174227e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.368579e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.441638e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.010569e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.653840e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.799070e+06 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index fbc0c57cb4..405a8e9845 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
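
Note on the [COUNTERS] lines above: the events/s figure is simply the number of events divided by the time spent in the matrix elements (e.g. 8192 events in 0.3400s gives 2.41E+04 events/s). A minimal sketch of that arithmetic, purely for illustration; the helper function is hypothetical and only mirrors the printed format:

#include <cstdio>

// Reproduce the "[COUNTERS] ... => throughput is ..." arithmetic from the logs above.
void printMeCounter( const char* label, double seconds, int nevents )
{
  std::printf( "[COUNTERS] %s : %.4fs for %d events => throughput is %.2E events/s\n",
               label, seconds, nevents, nevents / seconds );
}

int main()
{
  printMeCounter( "Fortran MEs ( 1 )", 0.3400, 8192 ); // 2.41E+04 events/s, as in the log above
  printMeCounter( "CudaCpp MEs ( 2 )", 0.1868, 8192 ); // 4.39E+04 events/s, as in the sse4 run above
  return 0;
}

Note that the printed times are rounded to four decimals, so recomputing the throughput from them can differ slightly in the last digit from the value reported in the log.
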
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:06:11 +DATE: 2025-10-11_17:10:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7420s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4102s - [COUNTERS] Fortran MEs ( 1 ) : 0.3318s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7519s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s + [COUNTERS] Fortran MEs ( 1 ) : 0.3398s for 8192 events => throughput is 2.41E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7176s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3860s - [COUNTERS] Fortran MEs ( 1 ) : 0.3316s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7271s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3864s + [COUNTERS] Fortran MEs ( 1 ) : 0.3408s for 8192 events => throughput is 2.40E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471473453718410E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7234s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3899s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3325s for 8192 events => throughput is 2.46E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7291s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3913s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3369s for 8192 events => throughput is 2.43E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471473453718410E-002) differ by less than 4E-4 (1.574588530672827e-07) +OK! 
xsec from fortran (7.8471485809748553E-002) and cpp (7.8471473453718410E-002) differ by less than 4E-4 (1.5745885295626039e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.535876e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.486290e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.542086e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.478806e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459294758378E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459219682932E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4904s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3886s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1014s for 8192 events => throughput is 8.08E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4955s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3907s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1044s for 8192 events => throughput is 7.85E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471459294758378E-002) differ by less than 4E-4 (3.37893311330717e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459219682932E-002) differ by less than 4E-4 (3.3885003380973444e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.182689e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.993300e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.204950e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.004232e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459718665412E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459708731872E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4358s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3891s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0464s for 8192 events => throughput is 1.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4415s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3925s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0486s for 8192 events => throughput is 1.69E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471459718665412E-002) differ by less than 4E-4 (3.324912595248364e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459708731872E-002) differ by less than 4E-4 (3.3261784726512644e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.782969e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.733359e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.783579e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.722443e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459718665412E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459708731872E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4301s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3871s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4378s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3922s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0452s for 8192 events => throughput is 1.81E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471459718665412E-002) differ by less than 4E-4 (3.324912595248364e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459708731872E-002) differ by less than 4E-4 (3.3261784726512644e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.968891e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850143e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.969858e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.891286e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471471932611128E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471471746130506E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4447s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3892s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0551s for 8192 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4526s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3929s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0592s for 8192 events => throughput is 1.38E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471471932611128E-002) differ by less than 4E-4 (1.768430569759616e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471471746130506E-002) differ by less than 4E-4 (1.792194693761573e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.481854e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.406796e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.468460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.412048e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471475012321185E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471471641207505E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8373s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8327s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.36E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.8323s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8265s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 8.95E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0049s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cuda (7.8471475012321185E-002) differ by less than 4E-4 (1.375968260441951e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471471641207505E-002) differ by less than 4E-4 (1.8055655381932212e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.717098e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.479157e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.890243e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.067147e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.313606e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.047251e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.232701e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.860004e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.300307e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.051348e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.230438e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.997681e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.193713e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.964172e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.247962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.785109e+06 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 2422d3068f..b21554372e 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg make USEBUILDDIR=1 BACKEND=cuda + + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 - make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
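
Note on the new header lines in these logs: each updated log now begins by echoing a few environment/build variables (MADGRAPH_CUDA_ARCHITECTURE, MADGRAPH_HIP_ARCHITECTURE, HASBLAS, CUDACPP_RUNTIME_BLASCOLORSUM, CUDACPP_RUNTIME_CUBLASTF32TENSOR, OMP_NUM_THREADS), all empty in these runs except HASBLAS=hasBlas. A minimal C++ sketch of that kind of echo, assuming nothing about the variables' semantics beyond what the logs show:

#include <cstdio>
#include <cstdlib>

// Print "NAME=value" for an environment variable, with an empty value when unset,
// matching the header lines of the logs above (the variables' meanings are not
// inferred here; this only reproduces the echo).
void echoEnv( const char* name )
{
  const char* value = std::getenv( name );
  std::printf( "%s=%s\n", name, value ? value : "" );
}

int main()
{
  echoEnv( "MADGRAPH_CUDA_ARCHITECTURE" );
  echoEnv( "MADGRAPH_HIP_ARCHITECTURE" );
  echoEnv( "HASBLAS" );
  echoEnv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  echoEnv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" );
  echoEnv( "OMP_NUM_THREADS" );
  return 0;
}
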
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:53 +DATE: 2025-10-11_17:10:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7391s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4096s - [COUNTERS] Fortran MEs ( 1 ) : 0.3295s for 8192 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7553s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s + [COUNTERS] Fortran MEs ( 1 ) : 0.3415s for 8192 events => throughput is 2.40E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7165s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3855s - [COUNTERS] Fortran MEs ( 1 ) : 0.3310s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7268s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3875s + [COUNTERS] Fortran MEs ( 1 ) : 0.3393s for 8192 events => throughput is 2.41E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471486590207584E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7396s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3874s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3510s for 8192 events => throughput is 2.33E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7475s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3883s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3580s for 8192 events => throughput is 2.29E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486590207584E-002) differ by less than 2E-4 (9.945765766516956e-09) +OK! 
xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486590207584E-002) differ by less than 2E-4 (9.945765988561561e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.409349e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.359867e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.415956e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.360283e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486540430027E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486557993325E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5676s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3876s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1792s for 8192 events => throughput is 4.57E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3921s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1821s for 8192 events => throughput is 4.50E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486540430027E-002) differ by less than 2E-4 (9.311426296676473e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486557993325E-002) differ by less than 2E-4 (9.535244149816435e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.653483e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.570903e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.691370e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.571774e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486395956899E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486463614210E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4809s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3907s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0896s for 8192 events => throughput is 9.14E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4882s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3954s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0922s for 8192 events => throughput is 8.88E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486395956899E-002) differ by less than 2E-4 (7.470335683379403e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614210E-002) differ by less than 2E-4 (8.332525558429893e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.402724e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.192817e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.391101e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.186620e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486395956899E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486463614210E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4660s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3858s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0796s for 8192 events => throughput is 1.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4787s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3937s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0844s for 8192 events => throughput is 9.71E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486395956899E-002) differ by less than 2E-4 (7.470335683379403e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614210E-002) differ by less than 2E-4 (8.332525558429893e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.055172e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.002954e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.066925e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000380e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471486537749241E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5026s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3850s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1170s for 8192 events => throughput is 7.00E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.5085s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3899s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1179s for 8192 events => throughput is 6.95E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486537749241E-002) differ by less than 2E-4 (9.277263846030337e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486537749241E-002) differ by less than 2E-4 (9.277264068074942e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.005425e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.931283e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.056979e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.899982e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485791426987E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486543087457E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8432s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8306s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0031s + [COUNTERS] PROGRAM TOTAL : 0.8420s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8352s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.93E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0055s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cuda (7.8471485791426987E-002) differ by less than 2E-4 (2.334807902570901e-10) +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471486543087457E-002) differ by less than 2E-4 (9.345291429596614e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.128450e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.941062e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.439893e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.043050e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.421024e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.003879e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.153444e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219422e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.432988e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.007497e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.169695e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.367555e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.432146e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.012869e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.638179e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.798121e+06 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 5517ab4292..fcf14d36a5 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:06:26 +DATE: 2025-10-11_17:10:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.6353s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3093s - [COUNTERS] Fortran MEs ( 1 ) : 4.3260s for 8192 events => throughput is 1.89E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8675s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3041s + [COUNTERS] Fortran MEs ( 1 ) : 4.5634s for 8192 events => throughput is 1.80E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.5825s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2904s - [COUNTERS] Fortran MEs ( 1 ) : 4.2921s for 8192 events => throughput is 1.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8255s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2969s + [COUNTERS] Fortran MEs ( 1 ) : 4.5287s for 8192 events => throughput is 1.81E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.7512s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4476s for 8192 events => throughput is 1.84E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0090s + [COUNTERS] PROGRAM TOTAL : 4.8499s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2944s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.5463s for 8192 events => throughput is 1.80E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0092s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240192) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.894558e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.855071e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.891638e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.864869e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.6638s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2927s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3665s for 8192 events => throughput is 3.46E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0045s + [COUNTERS] PROGRAM TOTAL : 2.8407s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2953s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5401s for 8192 events => throughput is 3.23E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0053s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240192) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.547129e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.391185e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.542201e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.371248e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3332s + [COUNTERS] PROGRAM TOTAL : 1.3634s [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0356s for 8192 events => throughput is 7.91E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0657s for 8192 events => throughput is 7.69E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.118919e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.818945e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.114943e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.888581e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.2181s + [COUNTERS] PROGRAM TOTAL : 1.2373s [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9207s for 8192 events => throughput is 8.90E+03 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9400s for 8192 events => throughput is 8.71E+03 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.276674e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.864841e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.241984e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.851817e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.4646s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2912s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1708s for 8192 events => throughput is 7.00E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s + [COUNTERS] PROGRAM TOTAL : 1.5242s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2959s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2254s for 8192 events => throughput is 6.69E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0029s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.083404e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.755860e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.099846e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.706109e+03 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8110s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7374s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0388s for 8192 events => throughput is 2.11E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0348s + [COUNTERS] PROGRAM TOTAL : 0.7754s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7315s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.26E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0246s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786561240192) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786561240197) differ by less than 3E-14 (0.0) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.149005e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.416533e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.350783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.462010e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.129093e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.359331e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.172100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.449399e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.126645e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.367790e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.170032e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.440795e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.144552e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.383135e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.426547e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.480569e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 78567e12c9..5c635cc8ef 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:08:49 +DATE: 2025-10-11_17:12:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.5864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2940s - [COUNTERS] Fortran MEs ( 1 ) : 4.2923s for 8192 events => throughput is 1.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8704s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2988s + [COUNTERS] Fortran MEs ( 1 ) : 4.5716s for 8192 events => throughput is 1.79E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.5924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2886s - [COUNTERS] Fortran MEs ( 1 ) : 4.3038s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8250s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s + [COUNTERS] Fortran MEs ( 1 ) : 4.5284s for 8192 events => throughput is 1.81E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144941544531159] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144941326459554] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.6210s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3184s for 8192 events => throughput is 1.90E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0085s + [COUNTERS] PROGRAM TOTAL : 4.7411s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.4378s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0087s *** (2-none) Compare MADEVENT_CPP x1 xsec to 
MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941544531159) differ by less than 4E-4 (4.675947774535061e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941326459554) differ by less than 4E-4 (4.669368411036601e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.957206e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.908171e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.957921e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.916943e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144937378275385] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.4924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2933s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1966s for 8192 events => throughput is 6.85E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s + [COUNTERS] PROGRAM TOTAL : 1.5212s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2931s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2254s for 8192 events => throughput is 6.68E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144937378275385) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.048957e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.792707e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.041651e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.847129e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8128s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2926s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5189s for 8192 events => throughput is 1.58E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5336s for 8192 events => throughput is 1.54E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939353225550) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.622272e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.560155e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.613287e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.556326e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7779s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2950s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4817s for 8192 events => throughput is 1.70E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s + [COUNTERS] PROGRAM TOTAL : 0.7790s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2954s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4823s for 8192 events => throughput is 1.70E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939353225550) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.826080e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.756110e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.802534e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.758530e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144947551388249] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8771s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2920s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5836s for 8192 events => throughput is 1.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9014s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6052s for 8192 events => throughput is 1.35E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144947551388249) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.430502e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.375609e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.421428e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.357712e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144955535316123] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144804761684321] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7866s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7350s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0271s for 8192 events => throughput is 3.02E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0245s + [COUNTERS] PROGRAM TOTAL : 0.7725s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7390s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0108s for 8192 events => throughput is 7.56E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0227s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.33144786561240197) and cuda (0.33144955535316123) differ by less than 4E-4 (5.0980589545446264e-06) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144804761684321) differ by less than 4E-4 (5.491193642015446e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.089397e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.844164e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.388762e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.016020e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.126017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.967323e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.254976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.138637e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.087410e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.960156e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.221892e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.136855e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.084262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
7.944572e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.392382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.273692e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 0f7d6f4131..2f61c77e8d 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:07:37 +DATE: 2025-10-11_17:11:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.5989s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2922s - [COUNTERS] Fortran MEs ( 1 ) : 4.3067s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8471s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2979s + [COUNTERS] Fortran MEs ( 1 ) : 4.5492s for 8192 events => throughput is 1.80E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.6012s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2899s - [COUNTERS] Fortran MEs ( 1 ) : 4.3113s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8278s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2989s + [COUNTERS] Fortran MEs ( 1 ) : 4.5289s for 8192 events => throughput is 1.81E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786734542164] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8059s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.5027s for 8192 events => throughput is 1.82E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.9193s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6155s for 8192 events => throughput is 1.77E+03 
events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0091s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786734542164) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.881337e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.840344e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.867505e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.842142e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786651655289] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.6829s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2920s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3862s for 8192 events => throughput is 3.43E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s + [COUNTERS] PROGRAM TOTAL : 2.7307s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2968s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4288s for 8192 events => throughput is 3.37E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786651655289) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.548157e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.428088e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.537868e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.464566e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3285s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0325s for 8192 events => throughput is 7.93E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0024s + [COUNTERS] PROGRAM TOTAL : 1.3474s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2970s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0479s for 8192 events => throughput is 7.82E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.171504e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.942226e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.183239e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.692396e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.1999s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2927s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9051s for 8192 events => throughput is 9.05E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.2106s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9138s for 8192 events => throughput is 8.96E+03 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.165581e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.272414e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.350878e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.142833e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.4750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2928s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1793s for 8192 events => throughput is 6.95E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0029s + [COUNTERS] PROGRAM TOTAL : 1.5269s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3007s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2234s for 8192 events => throughput is 6.70E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.035517e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.830218e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.843003e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.809509e+03 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786533876569] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786716305458] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8136s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7401s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0389s for 8192 events => throughput is 2.11E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0347s + [COUNTERS] PROGRAM TOTAL : 0.7808s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7376s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.27E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0240s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.33144786561240197) and cuda (0.33144786533876569) differ by less than 2E-4 (8.255786054789382e-10) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786716305458) differ by less than 2E-4 (4.6784207619055e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.142259e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.383309e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.350796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.484069e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.127674e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.409887e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.154284e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.456801e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.123213e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.362526e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.173815e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.463078e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.121978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
4.357037e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.416494e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.491061e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 74862dd5f7..fe6b10b3d3 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:10:25 +DATE: 2025-10-11_17:13:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.9475s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5365s - [COUNTERS] Fortran MEs ( 1 ) : 100.4109s for 8192 events => throughput is 8.16E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.2505s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5363s + [COUNTERS] Fortran MEs ( 1 ) : 101.7141s for 8192 events => throughput is 8.05E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.8105s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5296s - [COUNTERS] Fortran MEs ( 1 ) : 100.2810s for 8192 events => throughput is 8.17E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.2069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5320s + [COUNTERS] Fortran MEs ( 1 ) : 101.6749s for 8192 events => throughput is 8.06E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282475E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - 
[COUNTERS] PROGRAM TOTAL : 127.1376s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5284s - [COUNTERS] CudaCpp MEs ( 2 ) : 126.4018s for 8192 events => throughput is 6.48E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2074s + [COUNTERS] PROGRAM TOTAL : 128.7427s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5353s + [COUNTERS] CudaCpp MEs ( 2 ) : 127.9956s for 8192 events => throughput is 6.40E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2118s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282475E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.678586e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.580483e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.694101e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.620995e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 61.7097s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5331s - [COUNTERS] CudaCpp MEs ( 2 ) : 61.0765s for 8192 events => throughput is 1.34E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1001s + [COUNTERS] PROGRAM TOTAL : 69.6189s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5284s + [COUNTERS] CudaCpp MEs ( 2 ) : 68.9781s for 8192 events => throughput is 1.19E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1125s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.591189e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.424482e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.580161e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.419676e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 29.3577s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5263s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.7837s for 8192 events => throughput is 2.85E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0477s + [COUNTERS] PROGRAM TOTAL : 30.3572s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5354s + [COUNTERS] CudaCpp MEs ( 2 ) : 29.7726s for 8192 events => throughput is 2.75E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0492s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.407090e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.296671e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.415212e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.296231e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.2469s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5271s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.6788s for 8192 events => throughput is 3.19E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0410s + [COUNTERS] PROGRAM TOTAL : 26.8666s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5340s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2902s for 8192 events => throughput is 3.12E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0424s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.913687e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.796432e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.895964e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.783837e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.1607s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5255s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.5871s for 8192 events => throughput is 3.20E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0481s + [COUNTERS] PROGRAM TOTAL : 27.2211s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5330s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.6390s for 8192 events => throughput is 3.08E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0491s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.408791e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.322007e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.444614e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.342992e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282475E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282422E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 3.3131s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1215s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1040s for 8192 events => throughput is 7.42E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 1.0875s + [COUNTERS] PROGRAM TOTAL : 2.0387s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0768s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6155s for 8192 events => throughput is 1.33E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3464s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561551282475E-007) differ by less than 3E-14 (2.4424906541753444e-15) +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561551282422E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.491511e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336265e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.275455e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.298842e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.282089e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.363941e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.552042e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.311264e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.301465e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.338602e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.448921e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323398e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.252906e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336359e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.241973e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.336023e+03 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index bfa4b4cda4..da0706ada3 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:42:40 +DATE: 2025-10-11_17:46:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.8152s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5282s - [COUNTERS] Fortran MEs ( 1 ) : 100.2871s for 8192 events => throughput is 8.17E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.9219s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5367s + [COUNTERS] Fortran MEs ( 1 ) : 102.3853s for 8192 events => throughput is 8.00E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.7247s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5322s - [COUNTERS] Fortran MEs ( 1 ) : 100.1925s for 8192 events => throughput is 8.18E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.9948s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5407s + [COUNTERS] Fortran MEs ( 1 ) : 102.4541s for 8192 events => throughput is 8.00E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -100,7 +107,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -108,30 +114,30 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575849446922190E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575849511111252E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 112.7914s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5240s - [COUNTERS] CudaCpp MEs ( 2 ) : 112.0829s for 8192 events => throughput is 7.31E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1845s + [COUNTERS] PROGRAM TOTAL : 116.5594s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5371s + [COUNTERS] CudaCpp MEs ( 2 ) : 115.8332s for 8192 events => throughput is 7.07E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1891s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849446922190E-007) differ by less than 4E-4 (0.00013947977747852391) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849511111252E-007) differ by less than 4E-4 (0.00013948250052009392) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.631916e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.535383e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.625132e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.441970e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -146,7 +152,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -156,10 +161,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575845178322101E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 28.7980s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5271s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.2235s for 8192 events => throughput is 2.90E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0474s + [COUNTERS] PROGRAM TOTAL : 31.5456s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5700s + [COUNTERS] CudaCpp MEs ( 2 ) : 30.9224s for 8192 events => throughput is 2.65E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0531s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -170,14 +175,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845178322101E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.386203e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.071038e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.374145e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.043650e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -192,7 +197,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -202,10 +206,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 14.8120s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5245s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.2638s for 8192 events => throughput is 5.74E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0236s + [COUNTERS] PROGRAM TOTAL : 15.3844s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5370s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.8227s for 8192 events => throughput is 5.53E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0247s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -216,14 +220,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845169411084E-007 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.872770e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.685687e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.864576e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.672269e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -238,7 +242,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -248,10 +251,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.3091s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5262s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.7618s for 8192 events => throughput is 6.42E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0211s + [COUNTERS] PROGRAM TOTAL : 13.6990s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5329s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.1447s for 8192 events => throughput is 6.23E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0214s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -262,14 +265,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845169411084E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.728743e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.552784e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.768099e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.581015e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,7 +287,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -294,10 +296,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575850859831750E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.2286s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5280s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.6780s for 8192 events => throughput is 6.46E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0225s + [COUNTERS] PROGRAM TOTAL : 13.9360s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5476s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.3630s for 8192 events => throughput is 6.13E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0254s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -308,14 +310,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575850859831750E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.948019e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.686443e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.969717e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.667526e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -337,60 +339,60 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575862304433055E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572568120113116E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 2.2079s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1084s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5470s for 8192 events => throughput is 1.50E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.5524s + [COUNTERS] PROGRAM TOTAL : 1.5254s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0122s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2322s for 8192 events => throughput is 3.53E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2811s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3575862304433055E-007) differ by less than 4E-4 (0.00014002522141920437) +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572568120113116E-007) differ by less than 4E-4 (2.78664271879947e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.517499e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.547134e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.545233e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.607921e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.140576e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.571279e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.181453e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.601694e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.126165e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.579531e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.164632e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.607459e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.163932e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.584591e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.073078e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.996351e+03 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 3a68950921..972fcc6999 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:26:37 +DATE: 2025-10-11_17:30:19 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 101.1381s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5302s - [COUNTERS] Fortran MEs ( 1 ) : 100.6080s for 8192 events => throughput is 8.14E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.1691s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5343s + [COUNTERS] Fortran MEs ( 1 ) : 101.6348s for 8192 events => throughput is 8.06E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.8808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5357s - [COUNTERS] Fortran MEs ( 1 ) : 100.3451s for 8192 events => throughput is 8.16E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.2057s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5327s + [COUNTERS] Fortran MEs ( 1 ) : 101.6729s for 8192 events => throughput is 8.06E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561678995975E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 123.7239s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5356s - [COUNTERS] CudaCpp MEs ( 2 ) : 122.9787s for 8192 events => throughput is 6.66E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2095s + [COUNTERS] PROGRAM TOTAL : 130.3996s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5377s + [COUNTERS] CudaCpp MEs ( 2 ) : 129.6472s for 8192 events => throughput is 6.32E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2147s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! 
xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561678995975E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.634632e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.490256e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.608909e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.489525e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561701257335E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 64.5975s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5274s - [COUNTERS] CudaCpp MEs ( 2 ) : 63.9661s for 8192 events => throughput is 1.28E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1041s + [COUNTERS] PROGRAM TOTAL : 64.8540s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5288s + [COUNTERS] CudaCpp MEs ( 2 ) : 64.2213s for 8192 events => throughput is 1.28E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1039s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561701257335E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.549992e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.563988e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.544779e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.529721e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 28.6856s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5254s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.1150s for 8192 events => throughput is 2.91E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0453s + [COUNTERS] PROGRAM TOTAL : 28.8286s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5327s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.2496s for 8192 events => throughput is 2.90E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0463s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! 
xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.581303e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534195e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.574698e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.569719e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 24.6205s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5315s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.0503s for 8192 events => throughput is 3.41E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0387s + [COUNTERS] PROGRAM TOTAL : 26.1574s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5395s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.5773s for 8192 events => throughput is 3.20E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0406s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.161373e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.054403e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.184852e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.039174e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 25.7441s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5280s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.1699s for 8192 events => throughput is 3.25E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0462s + [COUNTERS] PROGRAM TOTAL : 26.7057s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5352s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.1230s for 8192 events => throughput is 3.14E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0475s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! 
xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.516660e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.438352e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.515216e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.447842e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561518129465E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561670766515E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 2.8461s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0822s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8795s for 8192 events => throughput is 9.31E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.8844s + [COUNTERS] PROGRAM TOTAL : 1.8201s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0131s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4965s for 8192 events => throughput is 1.65E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3105s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561518129465E-007) differ by less than 2E-4 (1.4064212017217415e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561670766515E-007) differ by less than 2E-4 (5.0687787300773834e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.415473e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.664884e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.080771e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.607592e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.106752e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.667090e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.156598e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.595955e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.106849e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.655497e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.103409e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.622539e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111142e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.675870e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.667428e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.460940e+03 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 7310cfc72a..7c2d5d02c8 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:09:42 +DATE: 2025-10-11_17:13:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5319s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4597s - [COUNTERS] Fortran MEs ( 1 ) : 0.0722s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5482s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4745s + [COUNTERS] Fortran MEs ( 1 ) : 0.0736s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4765s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4047s - [COUNTERS] Fortran MEs ( 1 ) : 0.0718s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4930s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4192s + [COUNTERS] Fortran MEs ( 1 ) : 0.0739s for 8192 events => throughput is 1.11E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737132] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4077s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0781s for 8192 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4901s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4103s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0791s for 8192 events => throughput is 1.04E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737132) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.073164e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055904e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.079140e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.064104e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4492s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4062s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4528s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737170) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.895347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.868596e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.917908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.882630e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4396s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4134s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0257s for 8192 events => throughput is 3.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4341s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4076s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0260s for 8192 events => throughput is 3.16E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.340027e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.217719e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.307491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.250909e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4310s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4082s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0224s for 8192 events => throughput is 3.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4367s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4117s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 8192 events => throughput is 3.34E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.693677e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.377107e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.718907e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.445554e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4438s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4093s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0340s for 8192 events => throughput is 2.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4456s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4100s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0350s for 8192 events => throughput is 2.34E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.386493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.314404e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.395890e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.349276e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,10 +341,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737173] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8495s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8451s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.52E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.8613s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8556s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.03E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0049s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -348,44 +355,44 @@ OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504505737173) diffe OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.777000e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.568159e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.265214e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.455155e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.327919e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.192502e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.161258e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.014422e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] 
[hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.316740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.214633e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.319766e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.430009e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.323054e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.226812e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646948e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.646817e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 748c92b28c..2376b74b06 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu + make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make 
USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:10:11 +DATE: 2025-10-11_17:13:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5240s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s - [COUNTERS] Fortran MEs ( 1 ) : 0.0718s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5325s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4601s + [COUNTERS] Fortran MEs ( 1 ) : 0.0724s for 8192 events => throughput is 1.13E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4796s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4074s - [COUNTERS] Fortran MEs ( 1 ) : 0.0721s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4871s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s + [COUNTERS] Fortran MEs ( 1 ) : 0.0728s for 8192 events => throughput is 1.13E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313506133732837] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4786s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4057s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0723s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4843s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313506133732837) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.132089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108850e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.123977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108803e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313502997679400] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4346s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4073s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0269s for 8192 events => throughput is 3.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4377s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4101s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0272s for 8192 events => throughput is 3.01E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502997679400) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.016574e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.944992e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.049161e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.961979e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4227s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0138s for 8192 events => throughput is 5.95E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502619857851) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.215183e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.824085e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.201945e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.049332e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4177s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4051s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4225s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4090s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502619857851) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.557168e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.355595e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.659565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.395017e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313505300145301] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4064s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0163s for 8192 events => throughput is 5.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4271s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0179s for 8192 events => throughput is 4.58E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313505300145301) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.736521e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.628365e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.799657e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.648318e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313508590887899] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313508404553540] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8496s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8457s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.64E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s + [COUNTERS] PROGRAM TOTAL : 0.8566s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8514s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.16E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0044s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cuda (0.20313508590887899) differ by less than 4E-4 (2.011051698502797e-07) +OK! xsec from fortran (0.20313504505737126) and cuda (0.20313508404553540) differ by less than 4E-4 (1.9193223965707773e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.049327e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.202405e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.339018e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.296000e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.110522e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.115794e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.423874e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.024681e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.090502e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.134420e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.757351e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.104635e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.720065e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.797328e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.206204e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.751422e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index dd13a39319..cf138d100f 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:09:56 +DATE: 2025-10-11_17:13:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5254s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4537s - [COUNTERS] Fortran MEs ( 1 ) : 0.0717s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5311s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4584s + [COUNTERS] Fortran MEs ( 1 ) : 0.0727s for 8192 events => throughput is 1.13E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4842s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4128s - [COUNTERS] Fortran MEs ( 1 ) : 0.0714s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4122s + [COUNTERS] Fortran MEs ( 1 ) : 0.0726s for 8192 events => throughput is 1.13E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504495344831] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4899s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4117s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0776s for 8192 events => throughput is 1.06E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.4868s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4073s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0788s for 8192 events => throughput is 1.04E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504495344831) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.073352e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054873e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.073996e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059290e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504495344833] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504500016025] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4513s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0421s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4535s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0431s for 8192 events => throughput is 1.90E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504495344833) differ by less than 2E-4 (5.115952106393706e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504500016025) differ by less than 2E-4 (2.816402666638851e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.886911e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.896659e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.898728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.911870e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510700500] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4424s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4165s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 8192 events => throughput is 3.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4326s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4072s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510700500) differ by less than 2E-4 (2.4433854939331923e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.243245e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.285561e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.311888e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331125e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510700500] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4308s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4323s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.44E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510700500) differ by less than 2E-4 (2.4433854939331923e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.793279e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.491118e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.775522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.400822e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510700500] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4486s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0350s for 8192 events => throughput is 2.34E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4453s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4096s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0351s for 8192 events => throughput is 2.33E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510700500) differ by less than 2E-4 (2.4433854939331923e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.316706e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.392779e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.334216e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.391910e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504512110778] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504511630270] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8511s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8469s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.63E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.8562s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8507s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.04E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504512110778) differ by less than 2E-4 (3.1376434783680907e-10) +OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504511630270) differ by less than 2E-4 (2.9010971402954056e-10) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.929266e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558045e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.319589e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.456934e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.340652e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.187313e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.169068e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.035767e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.326566e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.212826e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.337296e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.409792e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.337938e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.225960e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.656612e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.646014e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index d2a669114e..2e04a004a3 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx - -make USEBUILDDIR=1 BACKEND=cuda +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:54:40 +DATE: 2025-10-11_17:58:37 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 0.9766s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9291s - [COUNTERS] Fortran MEs ( 1 ) : 0.0475s for 8192 events => throughput is 1.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0898s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0409s + [COUNTERS] Fortran MEs ( 1 ) : 0.0488s for 8192 events => throughput is 1.68E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4581s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4105s - [COUNTERS] Fortran MEs ( 1 ) : 0.0476s for 8192 events => throughput is 1.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4945s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4458s + [COUNTERS] Fortran MEs ( 1 ) : 0.0487s for 8192 events => throughput is 1.68E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755170] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4592s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0501s for 8192 events => throughput is 1.63E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.5064s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4538s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0521s for 8192 events => throughput is 1.57E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755170) differ b OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.648377e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.624855e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.642355e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621541e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4344s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4065s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0275s for 8192 events => throughput is 2.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4797s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4512s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0281s for 8192 events => throughput is 2.91E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755183) differ b OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.984151e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.925389e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.017550e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.958081e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4261s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4709s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4533s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.75E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ b OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.938014e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.831423e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.942444e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.833351e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4299s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0153s for 8192 events => throughput is 5.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4705s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4537s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0165s for 8192 events => throughput is 4.97E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ b OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.398535e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.130791e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.466636e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.171570e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755179] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4391s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4149s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.45E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4789s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4536s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.30E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755179) differ b OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.480162e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.370093e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.526547e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.372925e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755192] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081479755196] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.8532s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8493s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.62E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8974s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8926s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.14E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0041s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081479755192) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081479755196) differ by less than 3E-14 (6.661338147750939e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.920216e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.725729e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.457557e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.044433e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.816989e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.665417e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.149758e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.597159e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.802618e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.632530e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.511448e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.850879e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.832166e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.607978e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.514724e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.211181e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index 483bc4166c..b05e5697ad 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:09 +DATE: 2025-10-11_17:59:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 0.9638s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9156s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0937s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0443s + [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4563s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s - [COUNTERS] Fortran MEs ( 1 ) : 0.0478s for 8192 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4992s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4492s + [COUNTERS] Fortran MEs ( 1 ) : 0.0500s for 8192 events => throughput is 1.64E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,26 +114,27 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160406825242951] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160406822335140] fbridge_mode=1 [UNWEIGHT] Wrote 1653 events (found 1658 events) - [COUNTERS] PROGRAM TOTAL : 0.4552s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4076s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0473s for 8192 events => throughput is 1.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5029s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4535s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0491s for 8192 events => throughput is 1.67E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.0160081479755183) and cpp (2.0160406825242951) differ by less than 4E-4 (1.6138103811513815e-05) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160406822335140) differ by less than 4E-4 (1.613795957533526e-05) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! -diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20 -7562,7575d7561 -< 4 1 1E-03 0.1250010E+03 0.7546771E-02 0.1235066E+00 +diff /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20 +8102,8116d8101 +< 5 1 1E-03 0.1250010E+03 0.7546771E-02 0.1235066E+00 < 21 -1 0 0 503 502 0.00000000000E+00 0.00000000000E+00 0.71320499473E+02 0.71320499473E+02 0.00000000000E+00 0. 1. < 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.54771239790E+02 0.54771239790E+02 0.00000000000E+00 0. 1. -< 5 1 1 2 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002893E+02 0.63925016162E+02 0.47000000000E+01 0. -1. -< -5 1 1 2 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762567893E+01 0.62166723101E+02 0.47000000000E+01 0. -1. +< 25 2 1 2 0 0 0.00000000000E+00 0.00000000000E+00 0.16549259682E+02 0.12609173926E+03 0.12500099485E+03 0. 0. +< 5 1 3 3 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002893E+02 0.63925016162E+02 0.47000000000E+01 0. -1. +< -5 1 3 3 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762567893E+01 0.62166723101E+02 0.47000000000E+01 0. -1. 
< < 0 0.12500099E+03 < 0 diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index b61563e796..a81624efdc 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:54:54 +DATE: 2025-10-11_17:58:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 0.9594s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9118s - [COUNTERS] Fortran MEs ( 1 ) : 0.0475s for 8192 events => throughput is 1.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0919s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0436s + [COUNTERS] Fortran MEs ( 1 ) : 0.0483s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4589s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4111s - [COUNTERS] Fortran MEs ( 1 ) : 0.0478s for 8192 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4974s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4479s + [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -100,7 +107,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 @@ -108,33 +114,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081964453331] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081963935692] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4600s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4089s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0507s for 8192 events => throughput is 1.62E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.5020s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4502s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0513s for 8192 events => throughput is 1.60E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964453331) differ by less than 2E-4 (2.4042469792817656e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081963935692) differ by less than 2E-4 (2.401679322083794e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.539881e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.533252e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.532971e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.529423e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -149,7 +152,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 @@ -157,33 +159,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081964453336] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081964477738] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4363s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0279s for 8192 events => throughput is 2.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4812s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0285s for 8192 events => throughput is 2.88E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964453336) differ by less than 2E-4 (2.404247001486226e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964477738) differ by less than 2E-4 (2.4043680380003707e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.824636e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.789074e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.869373e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.799101e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -198,7 +197,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 @@ -206,33 +204,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081962974745] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4311s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0169s for 8192 events => throughput is 4.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4709s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4532s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.73E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081962974745) differ by less than 2E-4 (2.3969127349587893e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.809707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.670071e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.724204e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.743283e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -247,7 +242,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 @@ -255,33 +249,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081962974745] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4252s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4093s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0155s for 8192 events => throughput is 5.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4728s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4554s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.80E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081962974745) differ by less than 2E-4 (2.3969127349587893e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.163712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.832111e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.204514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.036692e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,7 +287,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 @@ -304,33 +294,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081962970020] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081981445623] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4306s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4058s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0244s for 8192 events => throughput is 3.36E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4774s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.32E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081962970020) differ by less than 2E-4 (2.3968893092529697e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981445623) differ by less than 2E-4 (2.4885338012481384e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.121651e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.244912e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.119023e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.260859e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -352,60 +339,60 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081483021330] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081952642219] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.8574s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8536s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.63E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.9023s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8974s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to 
MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081483021330) differ by less than 2E-4 (1.6201062713605552e-10) +OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081952642219) differ by less than 2E-4 (2.345660332636612e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.018963e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.648200e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.363694e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.088314e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.820757e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.635192e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.067644e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.596149e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.797704e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.579204e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.465309e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.870733e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.821262e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.605252e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.503862e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.211048e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index d3cb91b8cd..ee647bf095 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:56:37 +DATE: 2025-10-11_18:00:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.6766s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3643s - [COUNTERS] Fortran MEs ( 1 ) : 2.3123s for 8192 events => throughput is 3.54E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7275s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3706s + [COUNTERS] Fortran MEs ( 1 ) : 2.3569s for 8192 events => throughput is 3.48E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.6640s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3633s - [COUNTERS] Fortran MEs ( 1 ) : 2.3007s for 8192 events => throughput is 3.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7259s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3684s + [COUNTERS] Fortran MEs ( 1 ) : 2.3575s for 8192 events => throughput is 3.47E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] 
Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8505s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3633s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4822s for 8192 events => throughput is 3.30E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s + [COUNTERS] PROGRAM TOTAL : 2.8149s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3695s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4402s for 8192 events => throughput is 3.36E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0051s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.457369e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.441343e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.441555e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.445366e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728610E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.6655s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3645s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2984s for 8192 events => throughput is 6.31E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s + [COUNTERS] PROGRAM TOTAL : 1.7137s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3713s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3396s for 8192 events => throughput is 6.12E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728610E-007) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728610E-007) differ by less than 3E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.514132e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.351156e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.544925e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.406951e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9435s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3668s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5751s for 8192 events => throughput is 1.42E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + [COUNTERS] PROGRAM TOTAL : 0.9625s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3707s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5902s for 8192 events => throughput is 1.39E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.460459e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.435538e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.466853e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.436593e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3647s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5144s for 8192 events => throughput is 1.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9044s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3692s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5338s for 8192 events => throughput is 1.53E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.641494e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.541883e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.655223e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.588675e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0440s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3665s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6757s for 8192 events => throughput is 1.21E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0018s + [COUNTERS] PROGRAM TOTAL : 1.0751s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3693s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7040s for 8192 events => throughput is 1.16E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0019s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.221115e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.193272e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.225553e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.191231e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,58 +341,58 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8457s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8061s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0199s for 8192 events => throughput is 4.13E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0198s + [COUNTERS] PROGRAM TOTAL : 0.8448s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8136s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.56E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0187s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cuda (7.6381610362728578E-007) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381610362728578E-007) differ by less than 3E-14 (0.0) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.230611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.695448e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.541816e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.925847e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.854537e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.997799e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.229320e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.170285e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.859903e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.983419e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.225591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.128334e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.850975e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.982511e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA 
[nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.687847e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.328429e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 10c15cf9d1..1cc58a2dd1 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:58:07 +DATE: 2025-10-11_18:02:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.6755s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3603s - [COUNTERS] Fortran MEs ( 1 ) : 2.3152s for 8192 events => throughput is 3.54E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7018s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3625s + [COUNTERS] Fortran MEs ( 1 ) : 2.3393s for 8192 events => throughput is 3.50E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.6754s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3698s - [COUNTERS] Fortran MEs ( 1 ) : 2.3056s for 8192 events => throughput is 3.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7141s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3681s + [COUNTERS] Fortran MEs ( 1 ) : 2.3460s for 8192 events => throughput is 3.49E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 
1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381686438954397E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381686359952968E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8067s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3659s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4358s for 8192 events => throughput is 3.36E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s + [COUNTERS] PROGRAM TOTAL : 2.7333s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3691s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3595s for 8192 events => throughput is 3.47E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381686438954397E-007) differ by less than 4E-4 (9.960018576560259e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686359952968E-007) differ by less than 4E-4 (9.949675585652074e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.485505e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.581994e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.473644e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.595398e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381671483253128E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0546s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3688s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6842s for 8192 events => throughput is 1.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0796s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3702s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7079s for 8192 events => throughput is 1.16E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381671483253128E-007) differ by less than 4E-4 (8.001994753481512e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381671483253128E-007) differ by less than 4E-4 (8.001994755701958e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.232148e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.209114e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.242719e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.211724e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.6626s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3670s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2947s for 8192 events => throughput is 2.78E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s + [COUNTERS] PROGRAM TOTAL : 0.6741s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3720s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3011s for 8192 events => throughput is 2.72E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.866680e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.778595e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.814611e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.785996e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.6345s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3672s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2664s for 8192 events => throughput is 3.07E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6455s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3705s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2742s for 8192 events => throughput is 2.99E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.183014e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.038472e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.199503e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.060001e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381686320975603E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.7045s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3656s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3379s for 8192 events => throughput is 2.42E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7218s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3694s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3514s for 8192 events => throughput is 2.33E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381686320975603E-007) differ by less than 4E-4 (9.944572607611946e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686320975603E-007) differ by less than 4E-4 (9.944572609832392e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.460974e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.367267e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.436294e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.356404e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381711031958629E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381615491789429E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8419s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8049s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0197s for 8192 events => throughput is 4.15E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0172s + [COUNTERS] PROGRAM TOTAL : 0.8351s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8093s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0182s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cuda (7.6381711031958629E-007) differ by less than 4E-4 (1.3179773188376487e-06) +OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381615491789429E-007) differ by less than 4E-4 (6.715046763083876e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.233915e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.138586e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.454452e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.179241e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.300238e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.224464e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.323216e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.249728e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.294935e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.225890e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.322990e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.250555e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.292471e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.220840e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA 
[nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.654983e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.651149e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 9cff3d3d2c..2ca786964c 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone + +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:57:22 +DATE: 2025-10-11_18:01:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.6661s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3588s - [COUNTERS] Fortran MEs ( 1 ) : 2.3072s for 8192 events => throughput is 3.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3648s + [COUNTERS] Fortran MEs ( 1 ) : 2.3619s for 8192 events => throughput is 3.47E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.6664s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3632s - [COUNTERS] Fortran MEs ( 1 ) : 2.3031s for 8192 events => throughput is 3.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7387s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3721s + [COUNTERS] Fortran MEs ( 1 ) : 2.3666s for 8192 events => throughput is 3.46E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608764955655E-007] fbridge_mode=1 [UNWEIGHT] 
Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8757s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3651s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.5054s for 8192 events => throughput is 3.27E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.8711s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3762s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4897s for 8192 events => throughput is 3.29E+03 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608764955655E-007) differ by less than 2E-4 (2.0918293319738268e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608764955655E-007) differ by less than 2E-4 (2.0918293208715966e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.427512e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.387716e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.426484e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.386658e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608686521600E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.6394s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3662s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2706s for 8192 events => throughput is 6.45E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s + [COUNTERS] PROGRAM TOTAL : 1.6908s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3716s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3164s for 8192 events => throughput is 6.22E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608686521600E-007) differ by less than 2E-4 (2.1945164241365944e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608686521600E-007) differ by less than 2E-4 (2.1945164130343642e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.733385e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.591306e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.780255e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.584653e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9411s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3649s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5747s for 8192 events => throughput is 1.43E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + [COUNTERS] PROGRAM TOTAL : 0.9663s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3722s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5924s for 8192 events => throughput is 1.38E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469379161117e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.446717e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.420848e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.473262e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.429579e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8685s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3656s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5015s for 8192 events => throughput is 1.63E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9022s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5284s for 8192 events => throughput is 1.55E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469379161117e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.681650e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.602337e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.668117e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.607376e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0574s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3699s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6857s for 8192 events => throughput is 1.19E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0018s + [COUNTERS] PROGRAM TOTAL : 1.0826s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7085s for 8192 events => throughput is 1.16E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0019s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469379161117e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.232369e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.176853e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.216790e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.176159e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610372590318E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608867927968E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8397s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8000s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0198s for 8192 events => throughput is 4.13E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0198s + [COUNTERS] PROGRAM TOTAL : 0.8465s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.53E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0188s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cuda (7.6381610372590318E-007) differ by less than 2E-4 (1.2911138824733825e-10) +OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381608867927968E-007) differ by less than 2E-4 (1.9570163600768353e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.219575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.668728e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.527801e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.889186e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.836972e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.020522e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.176072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.111985e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.835271e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.014502e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.206917e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.139379e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.823749e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.980651e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA 
[nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.671807e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.329147e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index f18eaf3551..869ed226f5 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:56 +DATE: 2025-10-11_17:59:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6925s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6838s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7024s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6938s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4263s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4177s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.62E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4256s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4169s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.46E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4276s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4188s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0084s for 8192 events => throughput is 9.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4378s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4280s + [COUNTERS] CudaCpp MEs ( 2 ) : 
0.0094s for 8192 events => throughput is 8.69E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.916439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.191014e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.017065e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.282907e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4290s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4241s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 8192 events => throughput is 1.79E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4316s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4266s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 8192 events => throughput is 1.75E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.913729e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.860989e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.928329e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.909431e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4198s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.80E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4296s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4263s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.79E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.118646e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.006727e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.327279e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.109595e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4242s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.07E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4313s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4281s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.87E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.142389e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.041656e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.418661e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.245400e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4276s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4241s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.67E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4344s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4307s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.48E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.810680e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.847128e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.123505e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.978037e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449452343426109] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449452343426103] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8704s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8668s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.8657s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8616s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.30449452343426120) and cuda (0.30449452343426109) differ by less than 3E-14 (3.3306690738754696e-16) +OK! xsec from fortran (0.30449452343426120) and cuda (0.30449452343426103) differ by less than 3E-14 (5.551115123125783e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.094441e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.369013e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.576690e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.148244e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.540792e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.850459e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.885377e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.711716e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.486109e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.810975e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.914518e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.845473e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.512059e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.786901e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.224875e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.505596e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 9cee2ab297..290a3c86d1 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:56:23 +DATE: 2025-10-11_18:00:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6965s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6879s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6996s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6911s + [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.67E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4263s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4177s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4259s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4174s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.55E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446496609361] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 8192 events => throughput is 9.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4354s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4265s + [COUNTERS] CudaCpp MEs ( 2 ) : 
0.0086s for 8192 events => throughput is 9.52E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446496609361) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.006620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.988834e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.012762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.001217e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446369440458] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4190s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4277s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.97E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446369440458) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.282555e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265266e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.369793e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.237148e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4206s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.08E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4268s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.33E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446614968528) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.872977e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.015677e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.148892e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.231737e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4201s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4180s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4273s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4252s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.39E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446614968528) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.886846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.231045e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.506416e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.443837e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449447031649013] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4202s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4176s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4294s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.60E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449447031649013) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.376595e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.280248e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.863933e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.772169e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449447352014630] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449447192383194] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8576s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8540s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.64E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.8794s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8751s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.30449452343426120) and cuda (0.30449447352014630) differ by less than 4E-4 (1.639245078566276e-07) +OK! xsec from fortran (0.30449452343426120) and cuda (0.30449447192383194) differ by less than 4E-4 (1.6916701384150912e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.209039e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.023525e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.497762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.499953e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.599688e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.571654e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.103544e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.545216e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.606706e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.440681e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.131283e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.320302e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.229812e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.015605e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.664371e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.300602e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 782fee34a5..54eb3e1a6f 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:56:09 +DATE: 2025-10-11_18:00:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6953s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6866s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6912s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6825s + [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.35E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4236s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4180s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.44E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4277s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4187s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 
9.54E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4348s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4250s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.68E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.831908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.020488e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.918457e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.158136e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4287s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4239s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.83E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4307s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4256s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 8192 events => throughput is 1.75E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.892977e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.944164e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.974211e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.990329e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4216s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.84E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4315s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.237521e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.282930e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.477152e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.189855e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4270s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4239s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4314s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.02E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.311234e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.114512e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.507028e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.432567e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4247s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4300s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4264s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.53E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.926715e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.966860e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.198931e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.100849e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449452360186230] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453231638185] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8627s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8591s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.8660s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8619s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.30449452343426120) and cuda (0.30449452360186230) differ by less than 2E-4 (5.504239286580059e-10) +OK! xsec from fortran (0.30449452343426120) and cuda (0.30449453231638185) differ by less than 2E-4 (2.917005059721589e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.206349e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.132456e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.536038e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.476431e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.506637e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.825751e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.900315e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.688447e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.486873e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.845505e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.921916e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.878507e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.466467e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.760833e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.235205e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.514420e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index bebebe43ae..79dba98821 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:13 +DATE: 2025-10-11_17:59:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8496s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8074s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8640s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8203s + [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4529s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4105s - [COUNTERS] Fortran MEs ( 1 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s + [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846964] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4621s - [COUNTERS] Fortran Overhead 
( 0 ) : 0.4167s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0450s for 8192 events => throughput is 1.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4711s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4252s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0455s for 8192 events => throughput is 1.80E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846964) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846964) differ by less than 3E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.859940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.837387e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.839978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.822913e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4403s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4153s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4480s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0258s for 8192 events => throughput is 3.17E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846957) differ by less than 3E-14 (0.0) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846957) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.243144e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.267707e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.273347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.222778e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4324s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4161s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4349s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4186s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.17E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.210364e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.198106e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.310117e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.028037e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4350s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4201s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0146s for 8192 events => throughput is 5.62E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4391s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4230s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.24E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.704117e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.463972e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.793092e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.474487e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4434s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4195s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4521s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4278s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 8192 events => throughput is 3.42E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.552376e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.505694e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.639783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.538808e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,58 +341,58 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8650s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8612s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.66E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8667s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8617s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cuda (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cuda (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.043338e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923790e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.325784e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.174225e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.871559e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.777101e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.143094e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.655868e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.865333e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.765814e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.020534e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.993174e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.868423e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.751468e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.708181e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.413877e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 2a76a737ac..5dfa48ff39 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:42 +DATE: 2025-10-11_17:59:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8397s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7971s - [COUNTERS] Fortran MEs ( 1 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8523s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8088s + [COUNTERS] Fortran MEs ( 1 ) : 0.0435s for 8192 events => throughput is 1.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4553s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4122s - [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4551s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4119s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641906072918047] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4624s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4193s + [COUNTERS] PROGRAM TOTAL : 0.4653s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4221s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to 
MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641906072918047) differ by less than 4E-4 (1.2595627507661078e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641906072918047) differ by less than 4E-4 (1.2595627474354387e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.972969e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.918004e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.987350e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.936998e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641902189470080] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4356s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4182s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4377s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4199s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.66E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641902189470080) differ by less than 4E-4 (2.1294735186305758e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902189470080) differ by less than 4E-4 (2.1294735152999067e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.748983e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.699516e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.695429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.722220e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,26 +208,26 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [UNWEIGHT] Wrote 1617 events (found 1622 events) [COUNTERS] PROGRAM TOTAL : 0.4310s [COUNTERS] Fortran Overhead ( 0 ) : 0.4214s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0093s for 8192 events => throughput is 8.83E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.72E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761755559866e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761733355405e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.169652e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.856695e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.239468e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.157334e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4281s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4187s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 8.96E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (44.641911695846957) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761755559866e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761733355405e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.627165e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.452792e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.935546e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.496015e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641906399820272] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4293s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4169s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4332s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4204s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0126s for 8192 events => throughput is 6.52E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641906399820272) differ by less than 4E-4 (1.1863351012664225e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641906399820272) differ by less than 4E-4 (1.1863350990459764e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.774461e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.751797e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.994273e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.843654e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641910992291372] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911000118164] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8577s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8540s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.8690s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8644s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.06E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cuda (44.641910992291372) differ by less than 4E-4 (1.575997887748315e-08) +OK! xsec from fortran (44.641911695846943) and cuda (44.641911000118164) differ by less than 4E-4 (1.5584654677880394e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.201092e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.158414e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.452650e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.781779e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.883185e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.387147e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.341479e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.660863e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.843740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.340902e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.360831e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.882663e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.608054e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.999883e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.014740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.181537e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 449e459bdc..4c27cac81e 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:27 +DATE: 2025-10-11_17:59:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8469s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8046s - [COUNTERS] Fortran MEs ( 1 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8565s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8130s + [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4519s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4587s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s + [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4618s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0455s for 8192 events => throughput is 1.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4690s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0468s for 8192 events => throughput is 1.75E+05 events/s 
[COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912938404218) differ by less than 2E-4 (2.783387209603916e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.833802e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.793421e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.834236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.799600e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4439s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4188s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4483s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4223s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 8192 events => throughput is 3.20E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912938404218) differ by less than 2E-4 (2.783387209603916e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.367073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.273502e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.340820e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.281864e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4302s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4144s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4382s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4219s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.17E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104058666648e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.283261e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.329657e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.353744e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.307405e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4297s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4151s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0143s for 8192 events => throughput is 5.75E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4397s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4242s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.42E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104058666648e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.825518e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.584798e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.928231e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.705746e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4383s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0220s for 8192 events => throughput is 3.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4435s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4205s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 8192 events => throughput is 3.61E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104058666648e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.615578e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.605692e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.732261e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652839e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911674225568] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912949951454] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8598s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8560s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.8669s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8620s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.17E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cuda (44.641911674225568) differ by less than 2E-4 (4.843293543999039e-10) +OK! xsec from fortran (44.641911695846943) and cuda (44.641912949951454) differ by less than 2E-4 (2.809253607516382e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.907482e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.727760e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.361691e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.049471e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.875077e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.736425e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.567905e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.634947e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.865156e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.745425e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.911973e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.997146e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.881287e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.718374e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.733673e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.415073e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/madX.sh b/epochX/cudacpp/tmad/madX.sh index 56fbce5d92..3c16230360 100755 --- a/epochX/cudacpp/tmad/madX.sh +++ b/epochX/cudacpp/tmad/madX.sh @@ -253,7 +253,7 @@ function getgridmax() elif [ "${ggttg}" == "1" ]; then echo 16384 32 # same total grid dimension as 2048 256 elif [ "${ggttgg}" == "1" ]; then - echo 16384 32 # same total grid dimension as 2048 256 + echo 512 32 # same total grid dimension as 64 256 (new sep2025: even 1024/32 aborts in max8thr mode) elif [ "${ggttggg}" == "1" ]; then echo 512 32 # same total grid dimension as 64 256 elif [ "${gguu}" == "1" ]; then @@ -478,9 +478,15 @@ function runmadevent() # PART 1 - build madevent ########################################################################## +echo MADGRAPH_CUDA_ARCHITECTURE=${MADGRAPH_CUDA_ARCHITECTURE} +echo MADGRAPH_HIP_ARCHITECTURE=${MADGRAPH_HIP_ARCHITECTURE} + unset GTEST_ROOT unset LOCALGTEST +export HASBLAS=hasBlas +echo HASBLAS=${HASBLAS} + for suff in $suffs; do dir=$(showdir) @@ -511,6 +517,12 @@ if [ "${maketype}" == "-makeonly" ]; then printf "\nMAKE COMPLETED\n"; exit 0; f # PART 2 - run madevent ########################################################################## +unset CUDACPP_RUNTIME_BLASCOLORSUM +printf "\nCUDACPP_RUNTIME_BLASCOLORSUM=$CUDACPP_RUNTIME_BLASCOLORSUM\n" + +unset CUDACPP_RUNTIME_CUBLASTF32TENSOR +printf "\nCUDACPP_RUNTIME_CUBLASTF32TENSOR=$CUDACPP_RUNTIME_CUBLASTF32TENSOR\n" + printf "\nOMP_NUM_THREADS=$OMP_NUM_THREADS\n" printf "\nDATE: $(date '+%Y-%m-%d_%H:%M:%S')\n\n" diff --git a/epochX/cudacpp/tmad/strip10x.sh b/epochX/cudacpp/tmad/strip10x.sh new file mode 100755 index 0000000000..571d134a64 --- /dev/null +++ b/epochX/cudacpp/tmad/strip10x.sh @@ -0,0 +1,11 @@ +#!/bin/sh +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +cd $(dirname $0) +for log in logs_*/log*.txt ; do + cat $log | awk 'BEGIN{ok=1}; /^\*\*\*/{if ($5=="x10") ok=0; else ok=1}; {if (ok==1) print $0}' > ${log}.new + mv ${log}.new ${log} +done diff --git a/epochX/cudacpp/tput/allTees.sh b/epochX/cudacpp/tput/allTees.sh index 69ef153764..8475b8fd1b 100755 --- a/epochX/cudacpp/tput/allTees.sh +++ b/epochX/cudacpp/tput/allTees.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Apr 2022) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. 
scrdir=$(cd $(dirname $0); pwd) @@ -20,7 +20,7 @@ if [ "$(hostname)" == "itgold91.cern.ch" ]; then bblds=-cpponly; fi # Usage function usage() { - echo "Usage (1): $0 [-short] [-e] [-sa] [-makeonly] [-nomakeclean] [-hip|-nocuda|-cpponly] [-bsmonly|-nobsm]" + echo "Usage (1): $0 [-short] [-e] [-sa] [-makeonly] [-nomakeclean] [-hip|-nocuda|-cpponly] [-bsmonly|-nobsm|-scalingonly|-blasonly|-blasandscalingonly]" echo "Run tests and check all logs" echo "" echo "Usage (2): $0 -checkonly" @@ -32,7 +32,10 @@ function usage() checkonly=0 ggttggg=-ggttggg rndhst=-curhst -bsm= +sm=1 +bsm=1 +scaling=1 +blas=1 if [ "$1" == "-checkonly" ]; then # Check existing logs without running any tests? checkonly=1 @@ -73,11 +76,35 @@ while [ "${checkonly}" == "0" ] && [ "$1" != "" ]; do if [ "${bblds}" != "" ] && [ "${bblds}" != "$1" ]; then echo "ERROR! Incompatible option $1: backend builds are already defined as '$bblds'"; usage; fi bblds="$1" shift - elif [ "$1" == "-bsmonly" ] && [ "$bsm" != "-nobsm" ]; then - bsm=$1 + elif [ "$1" == "-bsmonly" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=1 + scaling=0 + blas=0 shift - elif [ "$1" == "-nobsm" ] && [ "$bsm" != "-bsmonly" ]; then - bsm=$1 + elif [ "$1" == "-nobsm" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=1 + bsm=0 + scaling=1 + blas=1 + shift + elif [ "$1" == "-scalingonly" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=0 + scaling=1 + blas=0 + shift + elif [ "$1" == "-blasonly" ] && [ "${blas}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=0 + scaling=0 + blas=1 + shift + elif [ "$1" == "-blasandscalingonly" ] && [ "${blas}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=0 + scaling=1 + blas=1 shift else usage @@ -88,11 +115,28 @@ done function checklogs() { cd $scrdir/.. - # Print out any errors in the logs - if ! egrep -i '(error|fault|failed)' ./tput/logs_* -r; then echo "No errors found in logs"; fi + # Print out any errors in the logs (exclude scaling logs) + if ! egrep -i '(error|fault|failed)' ./tput/logs_*/*.txt; then echo "No errors found in logs"; fi # Print out any FPEs or '{ }' in the logs echo if ! egrep '(^Floating Point Exception|{ })' tput/logs* -r; then echo "No FPEs or '{ }' found in logs"; fi + # Print out any aborts in the logs (exclude scaling logs) + echo + txt=$(grep Abort ./tput/logs_*/*.txt | sed "s|\:.*SubProcesses/P|: P|") + if [ "${txt}" == "" ]; then + echo "No aborts found in logs" + else + echo "${txt}" + fi + # Print out any asserts/aborts in scaling logs + echo + txt=$(egrep -i '(abort|assert)' ./tput/logs_*/*.scaling | sed "s|\:.*SubProcesses/P|: P|" | sort -u) + if [ "${txt}" == "" ]; then + echo "No aborts or asserts found in scaling logs" + else + echo "${txt}" + fi + # Print out the MEK channelid debugging output (except for '{ }') echo \grep MEK ${scrdir}/logs_*/* | sed "s|${scrdir}/logs_||" | grep -v '{ }' | sed 's|_mad.*DEBUG:||' | sort -u @@ -123,11 +167,11 @@ fi cd $scrdir/.. started="STARTED AT $(date)" -# (36/102) Six logs (double/mixed/float x hrd0/hrd1 x inl0) in each of the six SM processes +# (+36: 36/144) Six logs (double/mixed/float x hrd0/hrd1 x inl0) in each of the six SM processes [sm==1] \rm -rf gg_ttggg${suff}/lib/build.none_* cmd="./tput/teeThroughputX.sh -dmf -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg ${makeclean} ${opts}" tmp1=$(mktemp) -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? 
ls -ltr ee_mumu${suff}/lib/build.none_*_inl0_hrd* gg_tt${suff}/lib/build.none_*_inl0_hrd* gg_tt*g${suff}/lib/build.none_*_inl0_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp1 else @@ -135,86 +179,140 @@ else fi ended1="$cmd\nENDED(1) AT $(date) [Status=$status]" -# (48/102) Four extra logs (double/float x hrd0/hrd1 x inl1) only in three of the six SM processes +# (+18: 54/144) Three scaling logs (double/mixed/float x hrd0 x inl0) in each of the six SM processes [scaling==1] +if [ "${scaling}" == "1" ]; then + if [ "${sm}" == "1" ]; then + cmd="./tput/teeThroughputX.sh -dmf -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg -scaling ${opts}" # no rebuild needed + $cmd; status=$? + else + cmd="./tput/teeThroughputX.sh -dmf -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg -scaling ${makeclean} ${opts}" # this is the first build + $cmd; status=$? + fi +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended1sc="$cmd\nENDED(1-scaling) AT $(date) [Status=$status]" + +# (+6: 60/144) Three extra logs (double/mixed/float x hrd0 x inl0 + blasOn) only in two of the six SM processes (rebuild may be needed) [blas==1] +if [ "${blas}" == "1" ]; then + if [ "${sm}" == "1" ] || [ "${scaling}" == "1" ]; then + cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -blasOn ${opts}" # no rebuild needed + $cmd; status=$? + else + cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -blasOn ${makeclean} ${opts}" # this is the first build + $cmd; status=$? + fi +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended2="$cmd\nENDED(2) AT $(date) [Status=$status]" + +# (+12: 72/144) Three scaling logs (double/mixed/float x hrd0 x inl0 + blasOn) only in four of the six SM processes [blas==1 || scaling==1] +if [ "${blas}" == "1" ] || [ "${scaling}" == "1" ]; then + cmd="./tput/teeThroughputX.sh -ggtt -ggttg -ggttgg -ggttggg -dmf -blasOn -scaling ${opts}" # no rebuild needed + $cmd; status=$? +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended2sc="$cmd\nENDED(2-scaling) AT $(date) [Status=$status]" + +# (+12: 84/144) Four extra logs (double/float x hrd0/hrd1 x inl1) only in three of the six SM processes [sm==1] \rm -rf gg_ttg${suff}/lib/build.none_* \rm -rf gg_ttggg${suff}/lib/build.none_* cmd="./tput/teeThroughputX.sh -d_f -hrd -makej -eemumu -ggtt -ggttgg -inlonly ${makeclean} ${opts}" -tmp2=$(mktemp) -if [ "${bsm}" != "-bsmonly" ]; then +tmp3=$(mktemp) +if [ "${sm}" == "1" ]; then $cmd; status=$? - ls -ltr ee_mumu${suff}/lib/build.none_*_inl1_hrd* gg_tt${suff}/lib/build.none_*_inl1_hrd* gg_tt*g${suff}/lib/build.none_*_inl1_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp2 + ls -ltr ee_mumu${suff}/lib/build.none_*_inl1_hrd* gg_tt${suff}/lib/build.none_*_inl1_hrd* gg_tt*g${suff}/lib/build.none_*_inl1_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp3 else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended2="$cmd\nENDED(2) AT $(date) [Status=$status]" +ended3="$cmd\nENDED(3) AT $(date) [Status=$status]" -# (60/102) Two extra logs (double/float x hrd0 x inl0 + bridge) in all six SM processes (rebuild from cache) +# (+12: 96/144) Two extra logs (double/float x hrd0 x inl0 + bridge) in all six SM processes (rebuild from cache) [sm==1] cmd="./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg $ggttggg -d_f -bridge ${makeclean} ${opts}" -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? 
fi -ended3="$cmd\nENDED(3) AT $(date) [Status=$status]" +ended4="$cmd\nENDED(4) AT $(date) [Status=$status]" -# (66/102) Two extra logs (double/float x hrd0 x inl0 + rmbhst) only in three of the six SM processes (no rebuild needed) +# (+6: 102/144) Two extra logs (double/float x hrd0 x inl0 + rmbhst) only in three of the six SM processes (no rebuild needed) [sm==1] cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f -rmbhst ${opts}" -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended4="$cmd\nENDED(4) AT $(date) [Status=$status]" +ended5="$cmd\nENDED(5) AT $(date) [Status=$status]" -# (72/102) Two extra logs (double/float x hrd0 x inl0 + rndhst) only in three of the six SM processes (no rebuild needed) +# (+6: 108/144) Two extra logs (double/float x hrd0 x inl0 + rndhst) only in three of the six SM processes (no rebuild needed) [sm==1] cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f ${rndhst} ${opts}" -if [ "${bsm}" != "-bsmonly" ] && [ "${rndhst}" != "-common" ]; then +if [ "${sm}" == "1" ] && [ "${rndhst}" != "-common" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended5="$cmd\nENDED(5) AT $(date) [Status=$status]" +ended6="$cmd\nENDED(6) AT $(date) [Status=$status]" -# (78/102) Two extra logs (double/float x hrd0 x inl0 + common) only in three of the six SM processes (no rebuild needed) +# (+6: 114/144) Two extra logs (double/float x hrd0 x inl0 + common) only in three of the six SM processes (no rebuild needed) [sm==1] cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f -common ${opts}" -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended6="$cmd\nENDED(6) AT $(date) [Status=$status]" +ended7="$cmd\nENDED(7) AT $(date) [Status=$status]" -# (102/102) Six extra logs (double/mixed/float x hrd0/hrd1 x inl0) only in the four BSM processes +# (+6: 120/144) Three extra logs (double/float x hrd0 x inl0 + noBlas) only in two of the six SM processes (rebuild is needed) [blas==1] +cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -noBlas ${makeclean} ${opts}" +if [ "${blas}" == "1" ]; then + $cmd; status=$? +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended8="$cmd\nENDED(8) AT $(date) [Status=$status]" + +# (+24: 144/144) Six extra logs (double/mixed/float x hrd0/hrd1 x inl0) only in the four BSM processes [bsm==1] cmd="./tput/teeThroughputX.sh -dmf -hrd -makej -susyggtt -susyggt1t1 -smeftggtttt -heftggbb ${makeclean} ${opts}" -tmp3=$(mktemp) -if [ "${bsm}" != "-nobsm" ]; then +tmp9=$(mktemp) +if [ "${bsm}" == "1" ]; then $cmd; status=$? - ls -ltr susy_gg_tt${suff}/lib/build.none_*_inl0_hrd* susy_gg_t1t1${suff}/lib/build.none_*_inl0_hrd* smeft_gg_tttt${suff}/lib/build.none_*_inl0_hrd* heft_gg_bb${suff}/lib/build.none_*_inl0_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp2 + ls -ltr susy_gg_tt${suff}/lib/build.none_*_inl0_hrd* susy_gg_t1t1${suff}/lib/build.none_*_inl0_hrd* smeft_gg_tttt${suff}/lib/build.none_*_inl0_hrd* heft_gg_bb${suff}/lib/build.none_*_inl0_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp9 else cmd="SKIP '$cmd'"; echo $cmd; status=$? 
fi -ended7="$cmd\nENDED(7) AT $(date) [Status=$status]" +ended9="$cmd\nENDED(9) AT $(date) [Status=$status]" echo echo "Build(1):" cat $tmp1 echo -echo "Build(2):" -cat $tmp2 +echo "Build(3):" +cat $tmp3 +echo +echo "Build(9):" +cat $tmp9 echo echo -e "$started" echo -e "$ended1" +echo -e "$ended1sc" echo -e "$ended2" +echo -e "$ended2sc" echo -e "$ended3" echo -e "$ended4" echo -e "$ended5" echo -e "$ended6" echo -e "$ended7" +echo -e "$ended8" +echo -e "$ended9" if [ "$ggttggg" == "" ]; then echo echo "To complete the test for ggttggg type:" echo " ./tput/teeThroughputX.sh -dmf -hrd -makej -ggttggg ${makeclean} ${opts}" + echo " ./tput/teeThroughputX.sh -dmf -makej -ggttggg -scaling ${makeclean} ${opts}" echo " ./tput/teeThroughputX.sh -makej -ggttggg -d_f -bridge ${makeclean} ${opts}" fi diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..1608b91cb1 --- /dev/null +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:39:36 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.365880e+06 1 256 +4.932658e+06 2 256 +1.130330e+07 4 256 +2.221065e+07 8 256 +3.796917e+07 16 256 +8.093742e+07 32 256 +1.438543e+08 64 256 +2.092652e+08 128 256 +2.586706e+08 256 256 +3.166572e+08 512 256 +3.450925e+08 1024 256 +### GPU: scaling test 32 +3.615411e+05 1 32 +7.956340e+05 2 32 +1.534533e+06 4 32 +2.896550e+06 8 32 +5.416499e+06 16 32 +1.086184e+07 32 32 +2.239377e+07 64 32 +4.040723e+07 128 32 +8.109125e+07 256 32 +1.501315e+08 512 32 +2.161406e+08 1024 32 +2.736516e+08 2048 32 +3.294400e+08 4096 32 +3.666924e+08 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.112163e+06 1 256 +1.095778e+06 2 256 +1.085622e+06 4 256 +### CPU: scaling test 32 +9.838283e+05 1 32 +1.009336e+06 2 32 +1.104848e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.791676e+06 1 256 +1.843126e+06 2 256 +1.850216e+06 4 256 +### CPU: scaling test 32 +1.835283e+06 1 32 +1.487162e+06 2 32 +1.478777e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.691677e+06 1 256 +2.725347e+06 2 256 +2.679688e+06 4 256 +### CPU: scaling test 32 +2.224230e+06 1 32 +2.558465e+06 2 32 +2.649774e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.781551e+06 1 256 +2.448941e+06 2 256 +2.756282e+06 4 256 +### CPU: scaling test 32 +2.377238e+06 1 32 +2.626719e+06 2 32 +2.722014e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.040101e+06 1 256 +2.059277e+06 2 256 +2.194331e+06 4 256 +### CPU: scaling test 32 +1.410251e+06 1 32 +1.626347e+06 2 32 +1.877466e+06 4 32 +========================================================================= + +TEST COMPLETED diff --git 
a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 2396150f34..6b63860e97 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:54:52 +DATE: 2025-10-11_15:13:43 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.715157e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.495446e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.756115e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.456825e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.020579e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.872827e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.722104 sec -INFO: No Floating Point Exceptions have been reported - 2,722,047,064 cycles # 2.855 GHz - 4,240,638,296 instructions # 1.56 insn per cycle - 1.034081868 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.693291 sec + 2,729,119,040 cycles # 2.827 GHz + 4,039,185,150 instructions # 1.48 insn per cycle + 1.043410313 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.013288e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.182482e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.182482e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.019940e+06 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.187870e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.187870e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.662912 sec -INFO: No Floating Point Exceptions have been reported - 19,208,633,801 cycles # 2.880 GHz - 46,193,026,925 instructions # 2.40 insn per cycle - 6.677929994 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.588033 sec + 19,038,044,386 cycles # 2.888 GHz + 46,485,585,356 instructions # 2.44 insn per cycle + 6.596061286 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.534189e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.004053e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.004053e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.557129e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.030035e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.030035e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.563122 
sec -INFO: No Floating Point Exceptions have been reported - 13,135,626,695 cycles # 2.874 GHz - 31,728,680,952 instructions # 2.42 insn per cycle - 4.573724377 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.460811 sec + 12,939,620,485 cycles # 2.898 GHz + 31,810,901,247 instructions # 2.46 insn per cycle + 4.469139042 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.938790e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.711147e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.711147e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.933537e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.681631e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.681631e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.694529 sec -INFO: No Floating Point Exceptions have been reported - 10,256,024,954 cycles # 2.769 GHz - 19,694,743,800 instructions # 1.92 insn per cycle - 3.707450749 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 3.671840 sec + 10,104,892,452 cycles # 2.749 GHz + 19,727,697,375 instructions # 1.95 insn per cycle + 3.679095535 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.944800e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.743029e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.743029e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.989488e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.781185e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.781185e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.690433 sec -INFO: No Floating Point Exceptions have been reported - 10,133,821,420 cycles # 2.743 GHz - 19,357,887,145 instructions # 1.91 insn per cycle - 3.703105135 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.576826 sec + 9,900,381,139 cycles # 2.765 GHz + 19,380,047,753 instructions # 1.96 insn per cycle 
+ 3.585735108 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.663763e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.201339e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.201339e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.671348e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.193135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.193135e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.240790 sec -INFO: No Floating Point Exceptions have been reported - 8,791,817,571 cycles # 2.072 GHz - 15,864,118,825 instructions # 1.80 insn per cycle - 4.252718180 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.184170 sec + 8,626,596,296 cycles # 2.060 GHz + 15,802,085,882 instructions # 1.83 insn per cycle + 4.189889070 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 97960252e7..7af659d91e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,252 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:36:32 +DATE: 2025-10-11_16:27:21 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.729675e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.983590e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.983590e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.684743e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.912007e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.912007e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.228883 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 7,241,513,211 cycles # 2.923 GHz - 12,978,693,777 instructions # 1.79 insn per cycle - 2.533005072 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 2.246839 sec + 7,225,562,469 cycles # 2.863 GHz + 12,863,341,750 instructions # 1.78 insn per cycle + 2.580507454 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.954014e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154803e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.154803e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.838576e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.140129e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.140129e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.972350 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 20,384,148,235 cycles # 2.919 GHz - 46,410,615,309 instructions # 2.28 insn per cycle - 6.984536194 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.023062 sec + 20,241,810,963 cycles # 2.880 GHz + 46,692,050,581 instructions # 2.31 insn per cycle + 7.030271965 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.493408e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.921090e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.921090e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.470152e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.890657e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.890657e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.877492 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 14,402,886,877 cycles # 2.946 GHz - 32,567,021,239 instructions # 2.26 insn per cycle - 4.890045852 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.909808 sec + 14,179,876,666 cycles # 2.885 GHz + 32,595,242,292 instructions # 2.30 insn per cycle + 4.916954834 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.864025e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.539449e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.539449e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.819567e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.481129e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.481129e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.048395 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 11,503,225,226 cycles # 2.834 GHz - 21,048,377,803 instructions # 1.83 insn per cycle - 4.060868426 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 4.095092 sec + 11,322,720,907 cycles # 2.761 GHz + 21,029,920,385 instructions # 1.86 insn per cycle + 4.102381100 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.889652e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.596697e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.596697e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.870930e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.557290e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.557290e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.001389 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 11,334,121,636 cycles # 2.824 GHz - 20,717,870,984 instructions # 1.83 insn per cycle - 4.014529771 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.995093 sec + 11,100,469,150 cycles # 2.774 GHz + 20,681,913,151 instructions # 1.86 insn per cycle + 4.002396442 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.585647e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.044820e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.044820e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.582678e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.044225e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.044225e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.655129 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,329,600,614 cycles # 2.214 GHz - 17,028,538,054 instructions # 1.65 insn per cycle - 4.667149794 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.613845 sec + 9,931,301,323 cycles # 2.150 GHz + 16,893,944,858 instructions # 1.70 insn per cycle + 4.620613606 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index a07615eec8..26a3ddb0c7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:48:44 +DATE: 2025-10-11_16:42:49 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.479194e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.613891e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.774308e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.197440e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.038954e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.882278e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.350685 sec -INFO: No Floating Point Exceptions have been reported - 4,619,154,070 cycles # 2.910 GHz - 7,244,933,472 
instructions # 1.57 insn per cycle - 1.645096659 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 1.377431 sec + 4,700,779,648 cycles # 2.862 GHz + 7,103,932,908 instructions # 1.51 insn per cycle + 1.699431401 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.031231e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.202853e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.202853e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.015955e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183181e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183181e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.892564 sec -INFO: No Floating Point Exceptions have been reported - 20,216,212,113 cycles # 2.933 GHz - 46,211,289,901 instructions # 2.29 insn per cycle - 6.898049528 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.982657 sec + 20,123,225,872 cycles # 2.880 GHz + 46,589,016,073 instructions # 2.32 insn per cycle + 6.988225439 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.575355e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.054940e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.054940e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.538846e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.003610e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.003610e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.814476 sec -INFO: No Floating Point Exceptions have been reported - 14,161,512,947 cycles # 2.938 GHz - 31,718,115,030 instructions # 2.24 insn per cycle - 4.820285845 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.882603 sec + 14,026,556,551 cycles # 2.870 GHz + 31,813,873,682 instructions # 2.27 insn per cycle + 4.888198902 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.990481e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.780031e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.780031e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.898151e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.633048e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.633048e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.993697 sec -INFO: No Floating Point Exceptions have been reported - 11,344,220,574 cycles # 2.837 GHz - 19,628,934,109 instructions # 1.73 insn per cycle - 3.999571252 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 4.110798 sec + 11,260,535,150 cycles # 2.739 GHz + 19,633,224,823 instructions # 1.74 insn per cycle + 4.116583823 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.024448e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.841239e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.841239e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.970956e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.746513e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.746513e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.944999 sec -INFO: No Floating Point Exceptions have been reported - 11,153,243,188 cycles # 2.824 GHz - 19,098,861,484 instructions # 1.71 insn per cycle - 3.950731996 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.988212 sec + 10,998,193,863 cycles # 2.755 GHz + 19,082,144,667 instructions # 1.74 insn per cycle + 3.993745104 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.731970e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.289397e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.289397e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.672146e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.193639e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.193639e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.507668 sec -INFO: No Floating Point Exceptions have been reported - 9,996,448,485 cycles # 2.215 GHz - 15,693,646,767 instructions # 1.57 insn per cycle - 4.513790217 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.562173 sec + 9,723,899,863 cycles # 2.130 GHz + 15,503,539,741 instructions # 1.59 insn per cycle + 4.567607097 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index cf4e1a1e41..6fb7bec229 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:45:58 +DATE: 2025-10-11_16:39:22 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.516686e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.553796e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.802555e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.211048e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.057687e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.886821e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.994565 sec -INFO: No Floating Point Exceptions have been reported - 3,557,200,491 cycles # 2.898 GHz - 7,056,373,361 
instructions # 1.98 insn per cycle - 1.285636058 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 1.007194 sec + 3,630,386,848 cycles # 2.852 GHz + 7,085,182,200 instructions # 1.95 insn per cycle + 1.329367848 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.036397e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.208868e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.208868e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.609025e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108811e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108811e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.480716 sec -INFO: No Floating Point Exceptions have been reported - 19,050,518,676 cycles # 2.938 GHz - 46,087,808,907 instructions # 2.42 insn per cycle - 6.486425223 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.966326 sec + 20,072,455,939 cycles # 2.880 GHz + 46,487,974,788 instructions # 2.32 insn per cycle + 6.971901471 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.562645e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.044042e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.044042e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.534636e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.011512e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.011512e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.441615 sec -INFO: No Floating Point Exceptions have been reported - 13,100,732,544 cycles # 2.946 GHz - 31,624,731,275 instructions # 2.41 insn per cycle - 4.447190414 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.522016 sec + 13,022,549,779 cycles # 2.877 GHz + 31,812,825,471 instructions # 2.44 insn per cycle + 4.527552219 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.962342e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.741135e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.741135e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.935285e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.687999e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.687999e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.617520 sec -INFO: No Floating Point Exceptions have been reported - 10,105,971,200 cycles # 2.790 GHz - 19,587,417,861 instructions # 1.94 insn per cycle - 3.623303854 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 3.667443 sec + 10,100,998,652 cycles # 2.751 GHz + 19,728,236,183 instructions # 1.95 insn per cycle + 3.673057057 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.035108e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.854302e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.854302e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.992051e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.787343e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.787343e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.498153 sec -INFO: No Floating Point Exceptions have been reported - 9,879,352,969 cycles # 2.820 GHz - 19,249,039,766 instructions # 1.95 insn per cycle - 3.504047287 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.571290 sec + 9,885,962,165 cycles # 2.765 GHz + 19,369,829,317 instructions # 1.96 insn per cycle + 3.576876880 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.738426e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.300548e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.300548e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.693244e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.231997e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.231997e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.029997 sec -INFO: No Floating Point Exceptions have been reported - 8,617,786,478 cycles # 2.136 GHz - 15,755,373,979 instructions # 1.83 insn per cycle - 4.035885525 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.132357 sec + 8,622,523,625 cycles # 2.084 GHz + 15,800,710,236 instructions # 1.83 insn per cycle + 4.137999929 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 23a95e9b43..93b11c3b79 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,235 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:43:10 +DATE: 2025-10-11_16:35:54 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.035607e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.566958e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.715605e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.941086e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.084749e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.895980e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.900996 sec -INFO: No Floating Point Exceptions have been reported - 6,141,367,935 cycles # 2.877 GHz - 11,470,611,621 instructions # 1.87 insn per cycle - 2.190401749 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 1.918291 sec + 6,252,733,621 cycles # 2.863 GHz + 11,379,391,021 instructions # 1.82 insn per cycle + 2.240220236 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.040250e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.212161e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.212161e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.013186e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180354e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180354e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.458941 sec -INFO: No Floating Point Exceptions have been reported - 19,062,791,283 cycles # 2.949 GHz - 46,091,693,422 instructions # 2.42 insn per cycle - 6.464859061 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.629592 sec + 19,062,117,259 cycles # 2.874 GHz + 46,484,682,805 instructions # 2.44 insn per cycle + 6.635147352 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.576646e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.057103e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.057103e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.545386e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.014583e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.014583e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.406292 sec -INFO: No Floating Point Exceptions have been reported - 12,965,800,121 cycles # 2.939 GHz - 31,623,980,844 instructions # 2.44 insn per cycle - 4.412202935 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.493129 sec + 12,958,309,518 cycles # 2.881 GHz + 31,813,104,162 instructions # 2.46 insn per cycle + 4.498775995 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.982815e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.782156e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.782156e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.912965e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.656557e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.656557e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.583602 sec -INFO: No Floating Point Exceptions have been reported - 10,107,254,042 cycles # 2.816 GHz - 19,587,412,579 instructions # 1.94 insn per cycle - 3.589639966 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 3.707178 sec + 10,138,189,210 cycles # 2.732 GHz + 19,728,296,128 instructions # 1.95 insn per cycle + 3.712878607 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.036151e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.856576e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.856576e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.985253e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.770354e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.770354e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.497964 sec -INFO: No Floating Point Exceptions have been reported - 9,879,922,849 cycles # 2.820 GHz - 19,260,007,955 instructions # 1.95 insn per cycle - 3.503929332 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.582064 sec + 9,886,774,092 cycles # 2.757 GHz + 19,370,169,431 instructions # 1.96 insn per cycle + 3.587619730 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.741980e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.303561e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.303561e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.686193e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.230105e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.230105e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.020543 sec -INFO: No Floating Point Exceptions have been reported - 8,613,807,526 cycles # 2.140 GHz - 15,755,294,312 instructions # 1.83 insn per cycle - 4.026429840 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.149789 sec + 8,677,655,368 cycles # 2.089 GHz + 15,800,773,198 instructions # 1.82 insn per cycle + 4.155474285 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 25ac5b33ed..0a4631bfc6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:55:23 +DATE: 2025-10-11_15:14:20 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.275982e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504846e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.746692e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.305792e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.022345e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.904091e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.698739 sec -INFO: No Floating Point Exceptions have been reported - 2,671,543,996 cycles # 2.868 GHz - 4,201,680,962 instructions # 1.57 insn per 
cycle - 1.042000131 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.693566 sec + 2,710,557,615 cycles # 2.827 GHz + 4,083,363,883 instructions # 1.51 insn per cycle + 1.021549892 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.030289e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.210430e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.210430e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.017450e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184170e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184170e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.580678 sec -INFO: No Floating Point Exceptions have been reported - 19,388,414,039 cycles # 2.942 GHz - 46,168,116,276 instructions # 2.38 insn per cycle - 6.592554583 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 452) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.603628 sec + 19,045,137,786 cycles # 2.882 GHz + 46,458,572,507 instructions # 2.44 insn per cycle + 6.609045751 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.571872e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.069657e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.069657e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.561588e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.042161e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.042161e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.454497 sec -INFO: No Floating Point Exceptions have been reported - 13,123,917,893 cycles # 2.941 GHz - 31,665,954,915 instructions # 2.41 insn per cycle - 4.468095413 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1648) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.447754 sec + 12,946,444,589 cycles # 2.908 GHz + 31,786,052,376 instructions # 2.46 insn per cycle + 4.453579330 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1659) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.982748e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.777393e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.777393e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.943406e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.706594e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.706594e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.618164 sec -INFO: No Floating Point Exceptions have been reported - 10,210,665,805 cycles # 2.814 GHz - 19,682,748,403 instructions # 1.93 insn per cycle - 3.629801888 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1894) (512y: 0) (512z: 0) +TOTAL : 3.652290 sec + 10,144,241,352 cycles # 2.774 GHz + 19,717,545,087 instructions # 1.94 insn per cycle + 3.657857806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.010638e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.831487e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.831487e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.997101e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.794298e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.794298e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.575425 sec -INFO: No Floating Point Exceptions have been reported - 10,055,677,244 cycles # 2.805 GHz - 19,379,411,405 instructions # 1.93 insn per cycle - 3.588891240 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1636) (512y: 178) (512z: 0) +TOTAL : 3.563735 sec + 9,854,038,944 cycles # 2.762 GHz + 19,385,201,008 instructions # 1.97 insn per cycle + 3.569441170 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1640) (512y: 180) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.768631e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372427e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372427e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.736214e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.301251e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.301251e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.003265 sec -INFO: No Floating Point Exceptions have been reported - 8,643,505,927 cycles # 2.154 GHz - 15,697,303,734 instructions # 1.82 insn per cycle - 4.017112338 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 833) (512y: 153) (512z: 1240) +TOTAL : 4.039858 sec + 8,445,670,568 cycles # 2.088 GHz + 15,663,059,460 instructions # 1.85 insn per cycle + 4.045505615 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 845) (512y: 154) (512z: 1244) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 9d9181639f..9b568d27dc 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:26:55 +DATE: 2025-10-11_16:16:29 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.029061e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569612e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.860356e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.176996e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.012495e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.891048e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.695056 sec -INFO: No Floating Point Exceptions have been reported - 2,704,879,803 cycles # 2.897 GHz - 4,231,460,596 instructions # 1.56 insn per 
cycle - 0.994220648 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.694489 sec + 2,721,882,133 cycles # 2.827 GHz + 4,075,193,578 instructions # 1.50 insn per cycle + 1.025946647 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.606609e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.069672e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.069672e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.542747e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.967302e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.967302e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.367451 sec -INFO: No Floating Point Exceptions have been reported - 12,912,062,009 cycles # 2.950 GHz - 32,678,927,799 instructions # 2.53 insn per cycle - 4.379017229 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 281) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.494551 sec + 12,989,678,815 cycles # 2.889 GHz + 32,646,175,174 instructions # 2.51 insn per cycle + 4.499744847 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 274) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.977635e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.819919e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.819919e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.896999e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.655930e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.655930e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.639596 sec -INFO: No Floating Point Exceptions have been reported - 10,716,876,159 cycles # 2.936 GHz - 25,005,426,831 instructions # 2.33 insn per cycle - 3.651343591 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1246) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.740364 sec + 10,735,813,544 cycles # 2.867 GHz + 24,899,817,001 instructions # 2.32 insn per cycle + 3.745821170 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1252) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.209379e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.259757e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.259757e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.183902e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.196051e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.196051e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.300079 sec -INFO: No Floating Point Exceptions have been reported - 9,398,178,742 cycles # 2.838 GHz - 16,938,114,674 instructions # 1.80 insn per cycle - 3.311853262 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1599) (512y: 0) (512z: 0) +TOTAL : 3.294762 sec + 9,147,621,247 cycles # 2.773 GHz + 16,945,065,636 instructions # 1.85 insn per cycle + 3.300349072 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1609) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.277311e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.397001e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.397001e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.267329e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.347814e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.347814e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.212263 sec -INFO: No Floating Point Exceptions have been reported - 9,139,009,296 cycles # 2.835 GHz - 16,502,297,129 instructions # 1.81 insn per cycle - 3.223908096 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1355) (512y: 139) (512z: 0) +TOTAL : 3.186397 sec + 8,854,475,202 cycles # 2.775 GHz + 16,456,181,779 instructions # 1.86 insn per cycle + 3.191297678 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1359) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.921368e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.661482e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.661482e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.906352e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.613901e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.613901e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.727052 sec -INFO: No Floating Point Exceptions have been reported - 8,146,634,535 cycles # 2.180 GHz - 14,661,732,896 instructions # 1.80 insn per cycle - 3.738643291 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1003) (512y: 158) (512z: 946) +TOTAL : 3.717092 sec + 7,920,630,909 cycles # 2.128 GHz + 14,619,990,772 instructions # 1.85 insn per cycle + 3.722531495 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1004) (512y: 158) (512z: 960) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index abe54e8953..e2fad0413c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:27:22 +DATE: 2025-10-11_16:16:58 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.921706e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.715910e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.877358e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.326337e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.070850e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.905795e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.678632 sec -INFO: No Floating Point Exceptions have been reported - 2,636,898,249 cycles # 2.884 GHz - 4,067,260,892 instructions # 1.54 insn per cycle - 0.973352356 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.687566 sec + 2,696,565,159 cycles # 2.829 GHz + 4,062,904,580 instructions # 1.51 insn per cycle + 1.010928380 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.084164e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.941928e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.941928e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.043775e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.849543e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.849543e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.469758 sec -INFO: No Floating Point Exceptions have been reported - 10,217,900,291 cycles # 2.936 GHz - 25,614,437,724 instructions # 2.51 insn per cycle - 3.480862891 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 236) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.494605 sec + 10,083,396,787 cycles # 2.882 GHz + 25,760,449,217 instructions # 2.55 insn per cycle + 3.499888853 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 246) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW 
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.313032e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.558172e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.558172e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.297652e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.517332e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.517332e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.173341 sec -INFO: No Floating Point Exceptions have been reported - 9,354,473,123 cycles # 2.939 GHz - 21,650,720,885 instructions # 2.31 insn per cycle - 3.184272296 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1112) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.161432 sec + 9,089,198,091 cycles # 2.871 GHz + 21,827,149,693 instructions # 2.40 insn per cycle + 3.166784889 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1116) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.358550e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.604458e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.604458e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.295786e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.454015e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.454015e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.115986 sec -INFO: No Floating Point Exceptions have been reported - 8,850,186,465 cycles # 2.831 GHz - 16,062,849,181 instructions # 1.81 insn per cycle - 3.126797345 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0) +TOTAL : 3.158774 sec + 8,695,257,664 cycles # 2.749 GHz + 15,965,615,823 instructions # 1.84 insn per cycle + 3.164128836 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1484) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.422935e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.724037e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.724037e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.398085e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.643924e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.643924e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.043301 sec -INFO: No Floating Point Exceptions have been reported - 8,651,791,606 cycles # 2.834 GHz - 15,666,461,627 instructions # 1.81 insn per cycle - 3.054177777 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1264) (512y: 141) (512z: 0) +TOTAL : 3.034628 sec + 8,440,163,243 cycles # 2.777 GHz + 15,795,186,827 instructions # 1.87 insn per cycle + 3.039990401 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1288) (512y: 141) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.052275e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.908416e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.908416e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.002688e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.799181e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.799181e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.514529 sec -INFO: No Floating Point Exceptions have been reported - 7,791,531,975 cycles # 2.211 GHz - 14,393,714,103 instructions # 1.85 insn per cycle - 3.525649878 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1031) (512y: 164) (512z: 876) +TOTAL : 3.557099 sec + 7,607,771,698 cycles # 2.137 GHz + 14,233,174,966 instructions # 1.87 insn per cycle + 3.562310738 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 992) (512y: 158) (512z: 880) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..a78c1b2deb --- /dev/null +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:40:18 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.981251e+06 1 256 +6.047935e+06 2 256 +1.122832e+07 4 256 +2.252678e+07 8 256 +4.235605e+07 16 256 +8.416122e+07 32 256 +1.466169e+08 64 256 +3.049065e+08 128 256 +4.651176e+08 256 256 +6.085927e+08 512 256 +7.481343e+08 1024 256 +### GPU: scaling test 32 +4.108938e+05 1 32 +7.731896e+05 2 32 +1.472652e+06 4 32 +3.058688e+06 8 32 +4.923029e+06 16 32 +1.154805e+07 32 32 +2.237762e+07 64 32 +4.518229e+07 128 32 +7.698959e+07 256 32 +1.503754e+08 512 32 +2.942634e+08 1024 32 +4.027161e+08 2048 32 +5.199929e+08 4096 32 +5.853205e+08 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.083777e+06 1 256 +1.126195e+06 2 256 +1.126272e+06 4 256 +### CPU: scaling test 32 +1.086034e+06 1 32 +1.116071e+06 2 32 +1.128798e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.853894e+06 1 256 +3.152865e+06 2 256 +3.025871e+06 4 256 +### CPU: scaling test 32 +2.851034e+06 1 32 +2.925313e+06 2 32 +2.581790e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.276087e+06 1 256 +3.611916e+06 2 256 +3.183634e+06 4 256 +### CPU: scaling test 32 +3.073082e+06 1 32 +3.375349e+06 2 32 +2.927052e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.662480e+06 1 256 +3.408266e+06 2 256 +3.661694e+06 4 256 +### CPU: scaling test 32 +1.789109e+06 1 32 +3.449949e+06 2 32 +3.560402e+06 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.254224e+06 1 256 +3.401880e+06 2 256 +3.536803e+06 4 256 +### CPU: scaling test 32 +1.684033e+06 1 32 +2.687382e+06 2 32 +2.916448e+06 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index fa697401ba..9dacd0443a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:56:56 +DATE: 2025-10-11_15:16:08 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.318402e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.547340e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.573294e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.223637e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.675161e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.645637e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.580539 sec -INFO: No Floating Point Exceptions have been reported - 2,318,735,379 cycles # 2.865 GHz - 3,612,120,055 instructions # 1.56 insn per cycle - 0.879357898 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.588199 sec + 2,408,587,167 cycles # 2.842 GHz + 3,683,823,828 instructions # 1.53 insn per cycle + 0.903961148 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.072197e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.275533e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) 
= ( 1.275533e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.035251e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.217456e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.217456e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.274259 sec -INFO: No Floating Point Exceptions have been reported - 18,464,131,410 cycles # 2.940 GHz - 45,058,020,075 instructions # 2.44 insn per cycle - 6.281329583 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.454566 sec + 18,664,660,450 cycles # 2.890 GHz + 45,251,843,843 instructions # 2.42 insn per cycle + 6.459911913 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.257463e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.446957e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.446957e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.213678e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.366853e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.366853e+06 ) sec^-1 
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.180369 sec -INFO: No Floating Point Exceptions have been reported - 9,372,467,471 cycles # 2.941 GHz - 22,319,965,268 instructions # 2.38 insn per cycle - 3.189536232 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.221547 sec + 9,347,928,391 cycles # 2.898 GHz + 22,375,063,737 instructions # 2.39 insn per cycle + 3.226933374 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.408379e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.710073e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.710073e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.361341e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.581474e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.581474e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.001033 sec -INFO: No Floating Point Exceptions have been reported - 8,493,792,111 cycles # 2.825 GHz - 15,797,222,111 
instructions # 1.86 insn per cycle - 3.010052254 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 3.041655 sec + 8,385,705,935 cycles # 2.753 GHz + 15,815,253,481 instructions # 1.89 insn per cycle + 3.046966557 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.426130e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.768067e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.768067e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.426573e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.714317e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.714317e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.984448 sec -INFO: No Floating Point Exceptions have been reported - 8,427,466,763 cycles # 2.818 GHz - 15,640,000,146 instructions # 1.86 insn per cycle - 2.993491493 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 2.970277 sec + 8,276,306,484 
cycles # 2.782 GHz + 15,653,687,115 instructions # 1.89 insn per cycle + 2.975610452 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.427110e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.709739e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.709739e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.392250e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.619370e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.619370e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.984139 sec -INFO: No Floating Point Exceptions have been reported - 6,725,622,216 cycles # 2.249 GHz - 12,910,486,373 instructions # 1.92 insn per cycle - 2.994013668 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 3.010134 sec + 6,663,148,382 cycles # 2.210 GHz + 12,894,118,429 instructions # 1.94 insn per cycle + 3.015621591 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 9136826931..215370ad38 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,252 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:37:07 +DATE: 2025-10-11_16:28:03 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.256593e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.121486e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.121486e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.220206e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.249013e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.249013e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.691319 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,610,006,933 cycles # 2.911 GHz - 10,218,919,767 instructions # 1.82 insn per cycle - 1.984436466 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 1.704287 sec + 5,590,644,626 cycles # 2.843 GHz + 10,005,372,723 instructions # 1.79 insn per cycle + 2.022727811 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.060836e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.248384e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.248384e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.010617e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.186955e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.186955e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.418392 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 18,916,088,440 cycles # 2.945 GHz - 45,156,650,630 instructions # 2.39 insn per cycle - 6.425565221 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.713335 sec + 19,329,941,883 cycles # 2.877 GHz + 45,365,505,516 instructions # 2.35 insn per cycle + 6.720261817 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.163234e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.223206e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.223206e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.128665e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.170237e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.170237e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.414716 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,073,193,872 cycles # 2.945 GHz - 23,610,645,909 instructions # 2.34 insn per cycle - 3.421707000 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.459266 sec + 10,015,354,665 cycles # 2.890 GHz + 23,673,664,836 instructions # 2.36 insn per cycle + 3.466212345 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.302389e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.467769e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.467769e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.263697e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.371457e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.371457e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.241454 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 9,215,037,610 cycles # 2.837 GHz - 16,874,646,512 instructions # 1.83 insn per cycle - 3.248598680 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 3.286775 sec + 9,106,177,679 cycles # 2.766 GHz + 16,899,675,653 instructions # 1.86 insn per cycle + 3.293662887 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.316990e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.533576e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.533576e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.302738e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.462511e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.462511e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.224710 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 9,166,209,661 cycles # 2.837 GHz - 16,710,284,997 instructions # 1.82 insn per cycle - 3.231713030 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 3.240690 sec + 8,985,254,061 cycles # 2.768 GHz + 16,737,997,718 instructions # 1.86 insn per cycle + 3.247472027 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.333210e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.469405e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.469405e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.254993e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.321155e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.321155e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.205451 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 7,432,634,328 cycles # 2.315 GHz - 14,074,642,515 instructions # 1.89 insn per cycle - 3.212353581 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 3.302457 sec + 7,458,897,279 cycles # 2.255 GHz + 14,069,459,173 instructions # 1.89 insn per cycle + 3.309041869 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 3c8228d85b..c35f97f2b8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:49:18 +DATE: 2025-10-11_16:43:25 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.233592e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.244967e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.184868e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.253381e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.370790e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.518342e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.220966 sec -INFO: No Floating Point Exceptions have been reported - 4,183,681,416 cycles # 2.867 GHz - 6,662,508,205 instructions 
# 1.59 insn per cycle - 1.516447212 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 1.218481 sec + 4,207,892,724 cycles # 2.859 GHz + 6,617,854,340 instructions # 1.57 insn per cycle + 1.530363886 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.080178e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.275874e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275874e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.036512e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.218588e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.218588e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.537773 sec -INFO: No Floating Point Exceptions have been reported - 19,269,764,932 cycles # 2.946 GHz - 45,190,617,795 instructions # 2.35 insn per cycle - 6.543013626 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.791690 sec + 19,679,660,217 cycles # 2.896 GHz + 45,434,399,439 instructions # 2.31 insn per cycle + 6.797219573 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.263942e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.453881e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.453881e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.200562e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.338496e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.338496e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.487545 sec -INFO: No Floating Point Exceptions have been reported - 10,298,424,695 cycles # 2.949 GHz - 22,355,388,978 instructions # 2.17 insn per cycle - 3.493059791 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.583516 sec + 10,308,901,515 cycles # 2.874 GHz + 22,457,815,111 instructions # 2.18 insn per cycle + 3.588832664 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.406924e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.701531e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.701531e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.344557e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.579879e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.579879e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.322767 sec -INFO: No Floating Point Exceptions have been reported - 9,443,809,325 cycles # 2.838 GHz - 15,664,102,195 instructions # 1.66 insn per cycle - 3.328357008 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 3.404488 sec + 9,434,839,609 cycles # 2.768 GHz + 15,726,735,545 instructions # 1.67 insn per cycle + 3.409840593 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.446360e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.803645e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.803645e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.407789e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.709415e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.709415e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.287328 sec -INFO: No Floating Point Exceptions have been reported - 9,371,124,961 cycles # 2.847 GHz - 15,299,944,141 instructions # 1.63 insn per cycle - 3.292839828 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 3.341843 sec + 9,335,373,029 cycles # 2.790 GHz + 15,365,478,048 instructions # 1.65 insn per cycle + 3.347112669 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.466708e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.777222e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.777222e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.374032e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.592267e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.592267e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.269312 sec -INFO: No Floating Point Exceptions have been reported - 7,659,274,117 cycles # 2.340 GHz - 12,573,895,764 instructions # 1.64 insn per cycle - 3.274843213 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 3.383460 sec + 7,651,857,041 cycles # 2.259 GHz + 12,604,317,732 instructions # 1.65 insn per cycle + 3.388617759 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 7f30dafdfd..4fe47b6309 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:46:29 +DATE: 2025-10-11_16:39:57 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.282321e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.333955e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.369324e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.232997e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.388992e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.560013e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.867789 sec -INFO: No Floating Point Exceptions have been reported - 3,167,199,789 cycles # 2.899 GHz - 6,506,216,930 instructions 
# 2.05 insn per cycle - 1.149942283 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.882532 sec + 3,214,322,203 cycles # 2.828 GHz + 6,452,752,496 instructions # 2.01 insn per cycle + 1.194579493 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.085219e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281583e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281583e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.031419e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.212428e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.212428e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.166056 sec -INFO: No Floating Point Exceptions have been reported - 18,234,644,828 cycles # 2.955 GHz - 45,008,398,832 instructions # 2.47 insn per cycle - 6.171760600 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.477368 sec + 18,661,812,568 cycles # 2.879 GHz + 45,252,341,321 instructions # 2.42 insn per cycle + 6.482693144 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.256894e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.462086e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.462086e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.196497e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.342466e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.342466e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.159870 sec -INFO: No Floating Point Exceptions have been reported - 9,347,982,513 cycles # 2.954 GHz - 22,275,896,372 instructions # 2.38 insn per cycle - 3.165402193 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.247962 sec + 9,353,957,329 cycles # 2.876 GHz + 22,375,680,082 instructions # 2.39 insn per cycle + 3.253308897 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.410366e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.712636e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.712636e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.352259e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.566980e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.566980e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.983322 sec -INFO: No Floating Point Exceptions have been reported - 8,463,194,185 cycles # 2.833 GHz - 15,755,395,679 instructions # 1.86 insn per cycle - 2.988746216 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 3.051523 sec + 8,419,136,103 cycles # 2.756 GHz + 15,815,678,204 instructions # 1.88 insn per cycle + 3.056921587 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.454105e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.801490e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.801490e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.409169e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.699321e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.699321e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.933599 sec -INFO: No Floating Point Exceptions have been reported - 8,319,397,972 cycles # 2.832 GHz - 15,593,973,322 instructions # 1.87 insn per cycle - 2.939101584 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 2.991131 sec + 8,296,340,422 cycles # 2.770 GHz + 15,649,217,834 instructions # 1.89 insn per cycle + 2.996375115 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.469652e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.768397e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.768397e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.362594e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.567971e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.567971e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.922384 sec -INFO: No Floating Point Exceptions have been reported - 6,636,368,959 cycles # 2.267 GHz - 12,865,256,567 instructions # 1.94 insn per cycle - 2.927905791 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 3.046737 sec + 6,657,108,236 cycles # 2.182 GHz + 12,894,608,228 instructions # 1.94 insn per cycle + 3.052164277 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index e2ecb9b5fd..a89730724c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,235 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:43:42 +DATE: 2025-10-11_16:36:29 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.979354e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.311142e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.251832e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.680186e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.389167e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.490052e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.493081 sec -INFO: No Floating Point Exceptions have been reported - 5,009,051,141 cycles # 2.916 GHz - 9,204,393,500 instructions # 1.84 insn per cycle - 1.774548277 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 1.528523 sec + 5,119,450,809 cycles # 2.867 GHz + 9,180,981,618 instructions # 1.79 insn per cycle + 1.841912956 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.077151e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.276926e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276926e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.028340e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.213140e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.213140e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.211513 sec -INFO: No Floating Point Exceptions have been reported - 18,299,232,198 cycles # 2.944 GHz - 45,005,768,829 instructions # 2.46 insn per cycle - 6.217115880 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.495821 sec + 18,726,914,707 cycles # 2.881 GHz + 45,252,147,765 instructions # 2.42 insn per cycle + 6.501028276 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.268380e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.460029e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.460029e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.215291e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.366977e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.366977e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.148224 sec -INFO: No Floating Point Exceptions have been reported - 9,293,240,022 cycles # 2.948 GHz - 22,275,553,802 instructions # 2.40 insn per cycle - 3.153857529 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.221927 sec + 9,338,555,823 cycles # 2.895 GHz + 22,375,290,209 instructions # 2.40 insn per cycle + 3.227594710 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.395770e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.675698e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.675698e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.376691e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.618820e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.618820e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.994942 sec -INFO: No Floating Point Exceptions have been reported - 8,447,981,393 cycles # 2.817 GHz - 15,754,576,494 instructions # 1.86 insn per cycle - 3.000419944 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 3.021316 sec + 8,423,872,827 cycles # 2.784 GHz + 15,815,022,260 instructions # 1.88 insn per cycle + 3.026847541 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.419912e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.751119e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.751119e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.398006e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.678623e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.678623e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.971435 sec -INFO: No Floating Point Exceptions have been reported - 8,357,800,499 cycles # 2.808 GHz - 15,594,139,449 instructions # 1.87 insn per cycle - 2.977163262 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 3.003583 sec + 8,296,430,270 cycles # 2.758 GHz + 15,653,949,933 instructions # 1.89 insn per cycle + 3.009064332 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.455367e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.730952e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.730952e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.376583e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.598108e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598108e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.933639 sec -INFO: No Floating Point Exceptions have been reported - 6,669,997,057 cycles # 2.271 GHz - 12,867,351,511 instructions # 1.93 insn per cycle - 2.938851588 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 3.029921 sec + 6,657,348,870 cycles # 2.194 GHz + 12,894,427,961 instructions # 1.94 insn per cycle + 3.035366895 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 9e915de581..1a227eb682 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:57:23 +DATE: 2025-10-11_15:16:39 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.310707e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.890276e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.030864e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.199628e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.780940e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.098104e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.577005 sec -INFO: No Floating Point Exceptions have been reported - 2,340,023,876 cycles # 2.880 GHz - 3,638,052,704 instructions # 1.55 insn per 
cycle - 0.886148283 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 79 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.592040 sec + 2,436,367,118 cycles # 2.822 GHz + 3,629,290,640 instructions # 1.49 insn per cycle + 0.920365880 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 72 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] 
[hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.074456e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.269687e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269687e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039860e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.223391e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.223391e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.244443 sec -INFO: No Floating Point Exceptions have been reported - 18,377,232,357 cycles # 2.941 GHz - 45,025,324,964 instructions # 2.45 insn per cycle - 6.253002386 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 397) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.427980 sec + 18,659,345,357 cycles # 2.901 GHz + 45,239,622,020 instructions # 2.42 insn per cycle + 6.433370102 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 408) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.251309e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.439034e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.439034e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.201529e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.346468e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.346468e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.184453 sec -INFO: No Floating Point Exceptions have been reported - 9,383,250,913 cycles # 2.940 GHz - 22,280,358,761 instructions # 2.37 insn per cycle - 3.194375038 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1935) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.240561 sec + 9,296,413,050 cycles # 2.865 GHz + 22,342,996,788 instructions # 2.40 insn per cycle + 3.245872745 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1946) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.403334e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.700033e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.700033e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.385031e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.622316e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.622316e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.004384 sec -INFO: No Floating Point Exceptions have been reported - 8,513,730,278 cycles # 2.827 GHz - 15,791,909,505 instructions # 1.85 insn per cycle - 3.013283160 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2540) (512y: 0) (512z: 0) +TOTAL : 3.012220 sec + 8,383,528,688 cycles # 2.779 GHz + 15,803,482,216 instructions # 1.89 insn per cycle + 3.017661777 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.444935e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.799463e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.799463e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.412617e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.685973e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.685973e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.959849 sec -INFO: No Floating Point Exceptions have been reported - 8,395,161,248 cycles # 2.830 GHz - 15,634,676,534 instructions # 1.86 insn per cycle - 2.968734397 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 10) (512z: 0) +TOTAL : 2.983146 sec + 8,252,716,563 cycles # 2.763 GHz + 15,642,709,201 instructions # 1.90 insn per cycle + 2.988589217 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2444) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.454317e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.767111e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.767111e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.388549e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.619875e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.619875e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.955150 sec -INFO: No Floating Point Exceptions have been reported - 6,701,822,130 cycles # 2.263 GHz - 12,886,633,037 instructions # 1.92 insn per cycle - 2.963931226 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 16) (512z: 1427) +TOTAL : 3.016137 sec + 6,649,228,149 cycles # 2.204 GHz + 12,869,205,720 instructions # 1.94 insn per cycle + 3.020818387 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1672) (512y: 5) (512z: 1432) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052564145764E-002 -Relative difference = 1.9988585667912256e-07 +Avg ME (F77/C++) = 1.2828052575059701E-002 +Relative difference = 2.0073664354238512e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 1fabc46555..38262df32b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:27:46 +DATE: 2025-10-11_16:17:26 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.309386e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.516838e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.621181e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.225159e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.730992e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.784746e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.581015 sec -INFO: No Floating Point Exceptions have been reported - 2,337,717,863 cycles # 2.893 GHz - 3,666,959,770 instructions # 1.57 insn per cycle - 0.866189287 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.586772 sec + 2,390,848,405 cycles # 2.830 GHz + 3,635,852,069 instructions # 1.52 insn per cycle + 0.901933192 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.617887e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.109367e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.109367e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.580341e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.051291e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.051291e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.275933 sec -INFO: No Floating Point Exceptions have been reported - 12,412,341,686 cycles # 2.900 GHz - 32,352,281,163 instructions # 2.61 insn per cycle - 4.283041784 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 290) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.360853 sec + 12,448,339,745 cycles # 2.853 GHz + 32,675,928,488 instructions # 2.62 insn per cycle + 4.365774305 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 289) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating 
Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039840314887E-002 -Relative difference = 1.244813035273009e-08 +Avg ME (F77/C++) = 1.2828039845771855E-002 +Relative difference = 1.2022736589486635e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.642717e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.471061e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.471061e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.653591e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.483795e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.483795e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.775228 sec -INFO: No Floating Point Exceptions have been reported - 8,161,861,180 cycles # 2.934 GHz - 18,732,698,985 instructions # 2.30 insn per cycle - 2.782796507 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1534) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.750086 sec + 7,984,215,270 cycles # 2.899 GHz + 18,676,669,518 instructions # 2.34 insn per cycle + 2.755384632 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1518) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039283704129E-002 -Relative difference = 5.583829420356249e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.771950e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.635210e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.635210e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.732255e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.524982e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.524982e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.653741 sec -INFO: No Floating Point Exceptions have been reported - 7,565,022,779 cycles # 2.844 GHz - 14,293,093,213 instructions # 1.89 insn per cycle - 2.661141426 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2234) (512y: 0) (512z: 0) +TOTAL : 2.676787 sec + 7,485,834,946 cycles # 2.792 GHz + 14,289,880,775 instructions # 1.91 insn per cycle + 2.681721539 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2235) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053244447801E-002 -Relative difference = 2.5291823782248813e-07 +Avg ME (F77/C++) = 1.2828053277189611E-002 +Relative difference = 2.5547059841227576e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.799741e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.762487e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.762487e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.815938e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.713073e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.713073e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.634363 sec -INFO: No Floating Point Exceptions have been reported - 7,504,285,407 cycles # 2.842 GHz - 13,994,355,792 instructions # 1.86 insn per cycle - 2.641913370 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2087) (512y: 3) (512z: 0) +TOTAL : 2.610308 sec + 7,285,805,876 cycles # 2.787 GHz + 14,002,821,074 instructions # 1.92 insn per cycle + 2.615329640 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2090) (512y: 3) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053244447801E-002 -Relative difference = 2.5291823782248813e-07 +Avg ME (F77/C++) = 1.2828053277189611E-002 +Relative difference = 2.5547059841227576e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.507958e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.890935e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.890935e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.445558e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.751827e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.751827e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.900923 sec -INFO: No Floating Point Exceptions have been reported - 6,641,718,947 cycles # 2.284 GHz - 13,481,348,782 instructions # 2.03 insn per cycle - 2.908502130 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2073) (512y: 1) (512z: 1201) +TOTAL : 2.952535 sec + 6,541,372,214 cycles # 2.212 GHz + 13,442,784,339 instructions # 2.06 insn per cycle + 2.957547644 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2077) (512y: 0) (512z: 1195) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052562326775E-002 -Relative difference = 1.997440588685788e-07 +Avg ME (F77/C++) = 1.2828052571421722E-002 +Relative difference = 2.004530479212976e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index ddc690e546..47c3a6f771 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:28:10 +DATE: 2025-10-11_16:17:52 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.311525e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.893939e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.130206e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.230358e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.785974e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.903505e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.580982 sec -INFO: No Floating Point Exceptions have been reported - 2,326,498,884 cycles # 2.887 GHz - 3,595,400,053 instructions # 1.55 insn per cycle - 0.865243472 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 79 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.585637 sec + 2,395,685,093 cycles # 2.840 GHz + 3,632,202,579 instructions # 1.52 insn per cycle + 0.900792937 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 72 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.199736e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.210916e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.210916e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.167434e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.153946e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.153946e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.247253 sec -INFO: No Floating Point Exceptions have been reported - 9,460,485,661 cycles # 2.907 GHz - 25,749,028,052 instructions # 2.72 insn per cycle - 3.254869601 seconds time elapsed +TOTAL : 3.280436 sec + 9,351,045,236 cycles # 2.847 GHz + 25,523,046,940 instructions # 2.73 insn per cycle + 3.285902426 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 243) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039838495897E-002 -Relative difference = 1.2589928273811243e-08 +Avg ME (F77/C++) = 1.2828039845771855E-002 +Relative difference = 1.2022736589486635e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.982142e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.480555e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.480555e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.975132e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.504192e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.504192e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.498717 sec -INFO: No Floating Point Exceptions have been reported - 7,385,528,393 cycles # 2.949 GHz - 16,812,365,380 instructions # 2.28 insn per cycle - 2.506313604 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1311) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.494622 sec + 7,225,776,791 cycles # 2.892 GHz + 16,897,519,367 instructions # 2.34 insn per cycle + 2.499894449 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.917887e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.065921e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.065921e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.863069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.858307e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.858307e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.542096 sec -INFO: No Floating Point Exceptions have been reported - 7,260,793,625 cycles # 2.848 GHz - 13,703,433,227 instructions # 1.89 insn per cycle - 2.549878549 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2067) (512y: 0) (512z: 0) +TOTAL : 2.571321 sec + 7,197,624,768 cycles # 2.795 GHz + 13,687,331,488 instructions # 1.90 insn per cycle + 2.576243151 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053220800939E-002 Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.947392e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.166768e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.166768e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.912761e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.069621e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.069621e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.537410 sec -INFO: No Floating Point Exceptions have been reported - 7,253,478,894 cycles # 2.851 GHz - 13,505,585,795 instructions # 1.86 insn per cycle - 2.545044336 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1935) (512y: 7) (512z: 0) +TOTAL : 2.533153 sec + 7,100,141,299 cycles # 2.799 GHz + 13,497,970,451 instructions # 1.90 insn per cycle + 2.538056554 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1946) (512y: 3) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
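[editorial note] The perf counters quoted after each TOTAL line are related by simple ratios: the "GHz" annotation is approximately cycles divided by elapsed time (perf actually normalizes by task-clock, so this is only approximate) and "insn per cycle" is instructions divided by cycles. A quick sketch using the build.512y_f_inl1_hrd1 numbers above:

#include <cstdio>

int main()
{
  const double cycles = 7100141299.0;        // from the build.512y_f_inl1_hrd1 run above
  const double instructions = 13497970451.0;
  const double elapsedSec = 2.538056554;     // approximation: perf divides by task-clock, not wall time
  std::printf( "~%.3f GHz\n", cycles / elapsedSec / 1e9 );        // ~2.80 GHz (the log reports 2.799)
  std::printf( "~%.2f insn per cycle\n", instructions / cycles ); // ~1.90
  return 0;
}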
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053220800939E-002 Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.612725e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.139660e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.139660e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.512964e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.923122e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.923122e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.798296 sec -INFO: No Floating Point Exceptions have been reported - 6,447,529,861 cycles # 2.298 GHz - 13,215,855,857 instructions # 2.05 insn per cycle - 2.806480502 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2039) (512y: 2) (512z: 1081) +TOTAL : 2.885451 sec + 6,375,003,514 cycles # 2.206 GHz + 13,181,689,692 instructions # 2.07 insn per cycle + 2.890749023 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2031) (512y: 1) (512z: 1091) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
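[editorial note] The "DEBUG: MEK (channelid array)" lines report how the 512 test events are distributed over the diagram-enhancement channels, here 256 events on channel 1 and 256 on channel 2. A minimal sketch of how such a summary could be built (an assumption about the bookkeeping, not the MatrixElementKernel code itself):

#include <cstdio>
#include <map>
#include <vector>

int main()
{
  const int nevt = 512, nchannels = 2;
  std::vector<unsigned int> channelIds( nevt );
  for( int ievt = 0; ievt < nevt; ievt++ )
    channelIds[ievt] = 1 + ievt / ( nevt / nchannels ); // first half -> channel 1, second half -> channel 2
  std::map<unsigned int, int> counts;                   // ordered map: channel -> event count
  for( auto c : channelIds ) counts[c]++;
  for( const auto& [channel, n] : counts )
    std::printf( "{ %u : %d }\n", channel, n );         // prints { 1 : 256 } and { 2 : 256 }
  return 0;
}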
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052536860923E-002 Relative difference = 1.977588895209662e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..78116e7085 --- /dev/null +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:39:57 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.811025e+06 1 256 +5.675268e+06 2 256 +1.125473e+07 4 256 +2.237542e+07 8 256 +4.084889e+07 16 256 +8.038307e+07 32 256 +1.408431e+08 64 256 +2.087041e+08 128 256 +2.617085e+08 256 256 +3.164102e+08 512 256 +3.490720e+08 1024 256 +### GPU: scaling test 32 +3.990821e+05 1 32 +7.057552e+05 2 32 +1.416039e+06 4 32 +2.964129e+06 8 32 +5.593795e+06 16 32 +1.165053e+07 32 32 +2.163693e+07 64 32 +4.137165e+07 128 32 +7.520702e+07 256 32 +1.314590e+08 512 32 +1.948562e+08 1024 32 +2.786288e+08 2048 32 +3.116503e+08 4096 32 +3.644493e+08 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.058031e+06 1 256 +1.064708e+06 2 256 +1.091924e+06 4 256 +### CPU: scaling test 32 +9.653674e+05 1 32 +1.073826e+06 2 32 +1.086320e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.851906e+06 1 256 +1.832695e+06 2 256 +1.916161e+06 4 256 +### CPU: scaling test 32 +1.906351e+06 1 32 +1.246470e+06 2 32 +1.664802e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.709626e+06 1 256 +2.644942e+06 2 256 +2.445350e+06 4 256 +### CPU: scaling test 32 +2.186539e+06 1 32 +2.363281e+06 2 32 +2.641954e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.767179e+06 1 256 +2.686691e+06 2 256 +2.759654e+06 4 256 +### CPU: scaling test 32 +1.340876e+06 1 32 +2.416645e+06 2 32 +2.506708e+06 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.171313e+06 1 256 +2.276072e+06 2 256 +2.282286e+06 4 256 +### CPU: scaling test 32 +1.265823e+06 1 32 +1.671673e+06 2 32 +2.039028e+06 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 8e00f9820d..caf7cf3a58 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
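[editorial note] The new .scaling log above sweeps the grid size at fixed threads-per-block (256 or 32), doubling the number of blocks each time and printing one line per grid size with the measured throughput, the block count and the thread count. A sketch of that measurement loop under stated assumptions (computeMEs and niter are hypothetical stand-ins for the real check.exe internals):

#include <chrono>
#include <cstdio>

// Hypothetical stand-in for one iteration of the ME calculation on a
// (blocks x threads) grid of events; check.exe runs this on GPU or CPU.
void computeMEs( int blocks, int threads ) { /* ... */ }

int main()
{
  const int threads = 256, niter = 10; // niter is an assumption
  for( int blocks = 1; blocks <= 1024; blocks *= 2 )
  {
    const auto t0 = std::chrono::steady_clock::now();
    for( int i = 0; i < niter; i++ ) computeMEs( blocks, threads );
    const std::chrono::duration<double> dt = std::chrono::steady_clock::now() - t0;
    const double throughput = double( blocks ) * threads * niter / dt.count(); // events per second
    std::printf( "%e %d %d\n", throughput, blocks, threads ); // same three columns as the tables above
  }
  return 0;
}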
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:55:54 +DATE: 2025-10-11_15:14:54 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.055673e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.658424e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.851508e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.254014e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.994980e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.902542e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.678781 sec -INFO: No Floating Point Exceptions have been reported - 2,628,768,348 cycles # 2.876 GHz - 4,103,389,790 instructions # 1.56 insn per cycle - 1.044225431 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.693324 sec + 2,725,071,311 cycles # 2.836 GHz + 4,080,796,637 instructions # 1.50 insn per cycle + 1.023122717 seconds time elapsed +......................................................................... 
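[editorial note] These logs now print two runtime variables, CUDACPP_RUNTIME_BLASCOLORSUM and CUDACPP_RUNTIME_CUBLASTF32TENSOR, both empty in this run. Assuming they act as simple on/off environment toggles (their exact semantics are not spelled out in the log), reading them would look like:

#include <cstdio>
#include <cstdlib>

// Assumption: an unset or empty variable leaves the default behaviour,
// a non-empty value enables the corresponding option.
static bool envFlagIsSet( const char* name )
{
  const char* value = std::getenv( name );
  return value != nullptr && value[0] != '\0';
}

int main()
{
  const bool useBlasColorSum = envFlagIsSet( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  const bool useTf32Tensor = envFlagIsSet( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" );
  std::printf( "BLAS color sum: %s, cuBLAS TF32 tensor ops: %s\n",
               useBlasColorSum ? "on" : "off", useTf32Tensor ? "on" : "off" );
  return 0;
}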
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039901590279E-002 -Relative difference = 7.671454200650844e-09 +Avg ME (F77/GPU) = 1.2828039945363461E-002 +Relative difference = 4.259149494690016e-09 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.011376e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.175905e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) 
= ( 1.175905e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.004559e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.167053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.167053e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.671756 sec -INFO: No Floating Point Exceptions have been reported - 19,661,999,702 cycles # 2.943 GHz - 46,395,546,050 instructions # 2.36 insn per cycle - 6.683261433 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.681187 sec + 19,310,569,163 cycles # 2.888 GHz + 46,561,074,047 instructions # 2.41 insn per cycle + 6.686779372 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.631538e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.161697e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.161697e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.592071e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.095366e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.095366e+06 ) sec^-1 
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.308846 sec -INFO: No Floating Point Exceptions have been reported - 12,713,127,116 cycles # 2.944 GHz - 31,571,564,120 instructions # 2.48 insn per cycle - 4.322869208 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1731) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.374152 sec + 12,572,513,674 cycles # 2.872 GHz + 31,463,286,168 instructions # 2.50 insn per cycle + 4.379862583 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1723) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.963768e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.746755e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.746755e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.938324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.700921e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.700921e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.649356 sec -INFO: No Floating Point Exceptions have been reported - 10,294,572,937 cycles # 2.814 GHz - 
19,586,622,017 instructions # 1.90 insn per cycle - 3.662289672 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2045) (512y: 0) (512z: 0) +TOTAL : 3.662440 sec + 10,121,778,715 cycles # 2.760 GHz + 19,471,159,122 instructions # 1.92 insn per cycle + 3.668260640 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.001856e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.818080e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.818080e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.971771e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.738449e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.738449e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.589952 sec -INFO: No Floating Point Exceptions have been reported - 10,108,826,304 cycles # 2.808 GHz - 19,396,692,714 instructions # 1.92 insn per cycle - 3.602641354 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1799) (512y: 188) (512z: 0) +TOTAL : 3.605464 sec + 
9,883,989,440 cycles # 2.738 GHz + 19,284,997,724 instructions # 1.95 insn per cycle + 3.611144081 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1786) (512y: 191) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.801777e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.420597e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.420597e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.763507e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.351410e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.351410e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.939332 sec -INFO: No Floating Point Exceptions have been reported - 8,555,878,739 cycles # 2.167 GHz - 15,216,666,169 instructions # 1.78 insn per cycle - 3.951287451 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 966) (512y: 154) (512z: 1330) +TOTAL : 3.983402 sec + 8,347,852,448 cycles # 2.093 GHz + 14,994,758,047 instructions # 1.80 insn per cycle + 3.989072483 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 952) (512y: 
154) (512z: 1313) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 0283d4438d..f781dc1bb5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
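[editorial note] If CUDACPP_RUNTIME_CUBLASTF32TENSOR were enabled, one plausible implementation (an assumption, not necessarily what the plugin does) is to switch the cuBLAS handle used for a BLAS-based color sum to TF32 tensor-core math for its single-precision GEMMs:

#include <cstdio>
#include <cublas_v2.h>

int main() // build with nvcc and link against -lcublas
{
  cublasHandle_t handle;
  if( cublasCreate( &handle ) != CUBLAS_STATUS_SUCCESS ) return 1;
  // Ask cuBLAS to use TF32 tensor cores for FP32 GEMMs (effective on Ampere and later GPUs)
  const cublasStatus_t status = cublasSetMathMode( handle, CUBLAS_TF32_TENSOR_OP_MATH );
  std::printf( "TF32 tensor-op math %s\n", status == CUBLAS_STATUS_SUCCESS ? "enabled" : "not enabled" );
  cublasDestroy( handle );
  return 0;
}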
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:56:25 +DATE: 2025-10-11_15:15:31 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.048170e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.671940e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.867900e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.263252e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.017320e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.920339e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.677955 sec -INFO: No Floating Point Exceptions have been reported - 2,610,429,449 cycles # 2.847 GHz - 4,074,904,816 instructions # 1.56 insn per cycle - 1.028610198 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.689357 sec + 2,740,273,431 cycles # 2.852 GHz + 4,084,188,832 instructions # 1.49 insn per cycle + 1.021206637 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
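[editorial note] The ncu lines just above report launch__registers_per_thread for the two kernels now profiled separately (130 for calculate_jamps and 16 for color_sum_kernel in this build). Register pressure mainly matters through occupancy; a minimal CUDA sketch of querying how many blocks of a given size fit per SM for a compiled kernel (someKernel is a placeholder, not a plugin kernel):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void someKernel() {} // placeholder for the kernels profiled above

int main()
{
  int maxBlocksPerSm = 0;
  const int blockSize = 256;
  // The runtime accounts for the kernel's register and shared-memory usage
  cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxBlocksPerSm, someKernel, blockSize, 0 /* dynamic shared mem */ );
  std::printf( "max active blocks per SM at %d threads/block: %d\n", blockSize, maxBlocksPerSm );
  return cudaPeekAtLastError() != cudaSuccess;
}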
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039901590279E-002 -Relative difference = 7.671454200650844e-09 +Avg ME (F77/GPU) = 1.2828039945363461E-002 +Relative difference = 4.259149494690016e-09 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.012794e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.178467e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.178467e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.004380e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.167437e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.167437e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.662178 sec -INFO: No Floating Point Exceptions have been reported - 19,608,707,308 cycles # 2.939 GHz - 46,331,953,932 instructions # 2.36 insn per cycle - 6.674225175 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 453) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.681530 sec + 19,329,038,472 cycles # 2.891 GHz + 46,534,784,670 instructions # 2.41 insn per cycle + 6.687165929 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point 
Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.631371e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.156116e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.156116e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.608782e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.123511e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.123511e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.305744 sec -INFO: No Floating Point Exceptions have been reported - 12,687,194,497 cycles # 2.940 GHz - 31,570,654,619 instructions # 2.49 insn per cycle - 4.317357131 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1724) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.330389 sec + 12,526,304,265 cycles # 2.890 GHz + 31,429,125,016 instructions # 2.51 insn per cycle + 4.336065673 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.951503e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.723168e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.723168e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.942808e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.702933e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.702933e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.669508 sec -INFO: No Floating Point Exceptions have been reported - 10,337,023,986 cycles # 2.809 GHz - 19,600,398,756 instructions # 1.90 insn per cycle - 3.680210311 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2036) (512y: 0) (512z: 0) +TOTAL : 3.652389 sec + 10,126,359,115 cycles # 2.769 GHz + 19,454,993,368 instructions # 1.92 insn per cycle + 3.658235344 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2019) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.000628e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.813640e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.813640e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.957600e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.738598e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.738598e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.591164 sec -INFO: No Floating Point Exceptions have been reported - 10,093,463,938 cycles # 2.804 GHz - 19,298,137,282 instructions # 1.91 insn per cycle - 3.601580555 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1766) (512y: 191) (512z: 0) +TOTAL : 3.629719 sec + 9,979,298,276 cycles # 2.746 GHz + 19,273,169,438 instructions # 1.93 insn per cycle + 3.635438116 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1773) (512y: 191) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.833398e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483164e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.483164e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.800984e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.418771e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.418771e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.878021 sec -INFO: No Floating Point Exceptions have been reported - 8,399,559,009 cycles # 2.161 GHz - 15,073,176,103 instructions # 1.79 insn per cycle - 3.888708235 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 959) (512y: 155) (512z: 1296) +TOTAL : 3.911829 sec + 8,199,622,084 cycles # 2.094 GHz + 14,847,008,944 instructions # 1.81 insn per cycle + 3.917306895 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 941) (512y: 155) (512z: 1281) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..4703fd43b7 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:40:39 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.383253e+06 1 256 +2.893064e+06 2 256 +5.376118e+06 4 256 +1.185151e+07 8 256 +2.346081e+07 16 256 +4.511286e+07 32 256 +5.630221e+07 64 256 +6.196121e+07 128 256 +6.780047e+07 256 256 +7.309787e+07 512 256 +7.376814e+07 1024 256 +### GPU: scaling test 32 +1.722124e+05 1 32 +3.905487e+05 2 32 +6.832898e+05 4 32 +1.517739e+06 8 32 +2.835858e+06 16 32 +6.130048e+06 32 32 +1.120344e+07 64 32 +2.084478e+07 128 32 +4.106718e+07 256 32 +5.763008e+07 512 32 +6.090072e+07 1024 32 +6.706632e+07 2048 32 +7.231618e+07 4096 32 +7.501823e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.767984e+05 1 256 +1.796605e+05 2 256 +1.802476e+05 4 256 +### CPU: scaling test 32 +1.472612e+05 1 32 +1.715919e+05 2 32 +1.711413e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.982512e+05 1 256 +3.086531e+05 2 256 +3.162558e+05 4 256 +### CPU: scaling test 32 +2.995750e+05 1 32 +2.938112e+05 2 32 +2.996907e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.811704e+05 1 256 +4.983434e+05 2 256 +5.240082e+05 4 256 +### CPU: scaling test 32 +4.296686e+05 1 32 +4.897722e+05 2 32 +4.790509e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.039122e+05 1 256 +5.537973e+05 2 256 +5.292318e+05 4 256 +### CPU: scaling test 32 +5.049628e+05 1 32 +5.163039e+05 2 32 +5.558813e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +3.352738e+05 1 256 +3.531052e+05 2 256 +3.524363e+05 4 256 +### CPU: scaling test 32 +3.508580e+05 1 32 +3.508926e+05 2 32 +3.509426e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 0abecbd859..b83fe948f8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:57:50 +DATE: 2025-10-11_15:17:08 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.424562e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.378226e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.000814e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.814869e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.187282e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.582493e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.532719 sec -INFO: No Floating Point Exceptions have been reported - 2,198,564,055 cycles # 2.860 GHz - 3,137,529,593 instructions # 1.43 insn per cycle - 0.850854779 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.541191 sec + 2,309,968,372 cycles # 2.848 GHz + 3,226,495,089 instructions # 1.40 insn per cycle + 0.869698260 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.821542e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.869016e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.869016e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.792870e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.839272e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.839272e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.902388 sec -INFO: No Floating Point Exceptions have been reported - 17,373,663,633 cycles # 2.939 GHz - 46,051,346,456 instructions # 2.65 insn per cycle - 5.916149203 seconds time elapsed +TOTAL : 5.956913 sec + 17,261,214,247 cycles # 2.896 GHz + 46,320,121,297 instructions # 2.68 insn per cycle + 5.962421755 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.199984e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.364044e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.364044e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.087487e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.238823e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.238823e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.425369 sec -INFO: No Floating Point Exceptions have been reported - 10,116,123,100 cycles # 2.945 GHz - 27,968,506,728 instructions # 2.76 insn 
per cycle - 3.436971917 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.506189 sec + 10,088,639,728 cycles # 2.873 GHz + 27,919,288,717 instructions # 2.77 insn per cycle + 3.512045055 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.021241e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.422127e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.422127e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.914379e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.288444e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.288444e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.236686 sec -INFO: No Floating Point Exceptions have been reported - 6,226,726,050 cycles # 2.773 GHz - 12,700,169,832 instructions # 2.04 insn per cycle - 2.249020906 seconds time elapsed +TOTAL : 2.241997 sec + 6,102,243,675 cycles # 2.716 GHz + 12,609,784,840 instructions # 2.07 insn per cycle + 2.247857659 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.518459e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.996461e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.996461e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.130809e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.541182e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.541182e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.046768 sec -INFO: No Floating Point Exceptions have been reported - 5,709,909,658 cycles # 2.777 GHz - 12,140,194,379 instructions # 2.13 insn per cycle - 2.059786524 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.151754 sec + 5,849,443,539 cycles # 2.712 GHz + 12,186,163,621 instructions # 2.08 insn per cycle + 2.157524773 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating 
Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.403513e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.583329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.583329e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.453655e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.631223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.631223e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.229693 sec -INFO: No Floating Point Exceptions have been reported - 6,051,702,488 cycles # 1.869 GHz - 8,428,750,265 instructions # 1.39 insn per cycle - 3.242969033 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.144840 sec + 5,734,260,839 cycles # 1.821 GHz + 8,277,135,516 instructions # 1.44 insn per cycle + 3.150611128 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..28ed30edba --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:54:51 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.305698e+05 1 256 +8.421080e+05 2 256 +1.658112e+06 4 256 +2.989838e+06 8 256 +4.972377e+06 16 256 +7.105357e+06 32 256 +9.196651e+06 64 256 +1.028995e+07 128 256 +1.118682e+07 256 256 +1.170520e+07 512 256 +1.194760e+07 1024 256 +### GPU: scaling test 32 +5.803167e+04 1 32 +1.141868e+05 2 32 +2.280709e+05 4 32 +4.392090e+05 8 32 +8.271820e+05 16 32 +1.628245e+06 32 32 +3.150764e+06 64 32 +5.031576e+06 128 32 +7.100399e+06 256 32 +9.298129e+06 512 32 +1.037459e+07 1024 32 +1.113939e+07 2048 32 +1.172028e+07 4096 32 +1.198120e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.715304e+05 1 256 +1.781417e+05 2 256 +1.794714e+05 4 256 +### CPU: scaling test 32 +1.577069e+05 1 32 +1.683648e+05 2 32 +1.674260e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.985670e+05 1 256 +3.075757e+05 2 256 +3.131579e+05 4 256 +### CPU: scaling test 32 +2.725469e+05 1 32 +2.816294e+05 2 32 +2.958942e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.247762e+05 1 256 +5.241155e+05 2 256 +4.852917e+05 4 256 +### CPU: scaling test 32 +5.186974e+05 1 32 +5.291399e+05 2 32 +5.305920e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.514805e+05 1 256 +5.505359e+05 2 256 +5.563984e+05 4 256 +### CPU: scaling test 32 +5.060969e+05 1 32 +5.545783e+05 2 32 +4.913100e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +3.339783e+05 1 256 +3.535899e+05 2 256 +3.481939e+05 4 256 +### CPU: scaling test 32 +3.145334e+05 1 32 +3.563455e+05 2 32 +3.387686e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..898eec66e3 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:50:32 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.041344e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200767e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.210879e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.316417 sec + 4,841,050,091 cycles # 2.845 GHz + 6,855,412,132 instructions # 1.42 insn per cycle + 1.762497593 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.782393e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.828671e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.828671e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.991425 sec + 17,268,124,515 cycles # 2.880 GHz + 46,321,023,545 instructions # 2.68 insn per cycle + 5.996950400 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.120284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.273768e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.273768e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.468964 sec + 10,062,208,508 cycles # 2.897 GHz + 27,919,768,700 instructions # 2.77 insn per cycle + 3.474512429 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063388515654
+Relative difference = 3.2588039900609506e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 4.922035e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.300092e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.300092e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.238317 sec
+ 6,090,888,500 cycles # 2.716 GHz
+ 12,608,791,480 instructions # 2.07 insn per cycle
+ 2.243747530 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063388516204
+Relative difference = 3.2588037186351226e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 5.153909e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.564898e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.564898e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.141769 sec
+ 5,839,015,371 cycles # 2.721 GHz
+ 12,183,200,067 instructions # 2.09 insn per cycle
+ 2.147164385 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063388516204
+Relative difference = 3.2588037186351226e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 3.421281e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.595508e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.595508e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 3.172923 sec
+ 5,704,193,065 cycles # 1.795 GHz
+ 8,277,048,290 instructions # 1.45 insn per cycle
+ 3.178502846 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 0a62f31f21..8fbb21e9ff 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,252 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:37:36 +DATE: 2025-10-11_16:28:38 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.523249e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.008578e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.008578e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.427555e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.769300e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.769300e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.943118 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,438,006,415 cycles # 2.887 GHz - 4,812,518,572 instructions # 1.40 insn per cycle - 1.248014993 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 0.828718 sec + 3,186,820,693 cycles # 2.852 GHz + 4,808,126,394 instructions # 1.51 insn per cycle + 1.176249753 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.806787e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.852935e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.852935e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.774052e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.819717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.819717e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.028463 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 17,701,660,691 cycles # 2.931 GHz - 46,100,592,443 instructions # 2.60 insn per cycle - 6.041454793 seconds time elapsed +TOTAL : 6.098613 sec + 17,597,864,140 cycles # 2.883 GHz + 46,380,415,047 instructions # 2.64 insn per cycle + 6.105859903 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.171570e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.328412e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.328412e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.088043e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.238153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.238153e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.537488 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,436,410,766 cycles # 2.940 GHz - 28,150,415,987 instructions # 2.70 insn per cycle - 3.550700440 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.585879 sec + 10,400,318,731 cycles # 2.896 GHz + 28,093,070,719 instructions # 2.70 insn per cycle + 3.593178065 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.940586e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.316252e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.316252e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.807610e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.170791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.170791e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.355700 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,586,554,223 cycles # 2.781 GHz - 12,999,619,553 instructions # 1.97 insn per cycle - 2.369192751 seconds time elapsed +TOTAL : 2.371916 sec + 6,428,829,911 cycles # 2.703 GHz + 12,887,812,684 instructions # 2.00 insn per cycle + 2.379156266 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.425137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.877080e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.877080e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.017593e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.406809e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.406809e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.160954 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,058,497,746 cycles # 2.788 GHz - 12,422,408,910 instructions # 2.05 insn per cycle - 2.174009213 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.281231 sec + 6,165,327,004 cycles # 2.695 GHz + 12,463,334,301 instructions # 2.02 insn per cycle + 2.288346369 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.454260e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.633384e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.633384e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.356453e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.524615e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.524615e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.271770 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,220,081,356 cycles # 1.894 GHz - 8,655,636,644 instructions # 1.39 insn per cycle - 3.285127387 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.315612 sec + 6,121,266,749 cycles # 1.843 GHz + 8,516,898,541 instructions # 1.39 insn per cycle + 3.322530830 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 70d02af695..26e0f25894 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:49:47 +DATE: 2025-10-11_16:44:00 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.202403e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.187841e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.877468e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.725056e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.186541e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.580567e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.637594 sec -INFO: No Floating Point Exceptions have been reported - 2,481,390,363 cycles # 2.852 GHz - 3,619,998,982 instructions # 1.46 insn per cycle - 0.928734017 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.638610 sec + 2,571,549,393 cycles # 2.847 GHz + 3,659,796,797 instructions # 1.42 insn per cycle + 0.960427498 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.808108e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.854363e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.854363e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.781185e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.826305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.826305e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.981515 sec -INFO: No Floating Point Exceptions have been reported - 17,441,882,337 cycles # 2.914 GHz - 45,980,812,555 instructions # 2.64 insn per cycle - 5.987317462 seconds time elapsed +TOTAL : 6.057966 sec + 17,438,379,118 cycles # 2.877 GHz + 46,337,653,518 instructions # 2.66 insn per cycle + 6.063608366 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.173867e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.332553e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.332553e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.115210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.268081e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.268081e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.490197 sec -INFO: No Floating Point Exceptions have been reported - 10,215,611,800 cycles # 2.923 GHz - 27,889,324,001 instructions # 2.73 insn per cycle - 3.495993800 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.536392 sec + 10,229,702,343 cycles # 2.889 GHz + 27,918,943,570 instructions # 2.73 insn per cycle + 3.542208033 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.999819e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.389873e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] 
(3a) = ( 5.389873e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.877271e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.247954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.247954e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.281339 sec -INFO: No Floating Point Exceptions have been reported - 6,287,168,374 cycles # 2.750 GHz - 12,602,929,813 instructions # 2.00 insn per cycle - 2.287435325 seconds time elapsed +TOTAL : 2.320644 sec + 6,288,847,916 cycles # 2.704 GHz + 12,592,903,872 instructions # 2.00 insn per cycle + 2.326302778 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.471434e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.936245e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.936245e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.123817e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.531393e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.531393e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.097327 sec -INFO: No Floating Point Exceptions 
have been reported - 5,814,420,150 cycles # 2.765 GHz - 11,994,829,914 instructions # 2.06 insn per cycle - 2.103345298 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.218321 sec + 6,014,515,797 cycles # 2.706 GHz + 12,133,309,602 instructions # 2.02 insn per cycle + 2.224085333 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.462865e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.641783e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.641783e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.381723e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.553268e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.553268e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.218108 sec -INFO: No Floating Point Exceptions have been reported - 5,937,437,503 cycles # 1.843 GHz - 8,290,568,638 instructions # 1.40 insn per cycle - 3.224462086 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 
3.273257 sec + 5,933,511,412 cycles # 1.811 GHz + 8,229,034,215 instructions # 1.39 insn per cycle + 3.278919832 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 794a3c9310..4d5855b54d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:46:56 +DATE: 2025-10-11_16:40:27 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.311257e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.342288e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.004457e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.767730e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.205228e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.589097e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.566605 sec -INFO: No Floating Point Exceptions have been reported - 2,313,605,054 cycles # 2.893 GHz - 3,600,350,267 instructions # 1.56 insn per cycle - 0.856648834 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.575649 sec + 2,386,111,811 cycles # 2.845 GHz + 3,639,741,256 instructions # 1.53 insn per cycle + 0.895952286 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.824387e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.871256e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.871256e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.791051e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.837013e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.837013e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.855309 sec -INFO: No Floating Point Exceptions have been reported - 17,230,682,954 cycles # 2.940 GHz - 45,932,528,772 instructions # 2.67 insn per cycle - 5.861424268 seconds time elapsed +TOTAL : 5.963163 sec + 17,264,643,304 cycles # 2.893 GHz + 46,321,097,140 instructions # 2.68 insn per cycle + 5.968989618 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.215073e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.378302e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.378302e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.101295e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.253753e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.253753e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.372268 sec -INFO: No Floating Point Exceptions have been reported - 9,959,367,668 cycles # 2.949 GHz - 27,848,270,798 instructions # 2.80 insn per cycle - 3.378265573 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.491410 sec + 10,059,054,482 cycles # 2.877 GHz + 27,919,466,540 instructions # 2.78 insn per cycle + 3.497008176 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.999546e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.391220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.391220e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.890079e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.263113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.263113e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.206484 sec -INFO: No Floating Point Exceptions have been reported - 6,113,930,208 cycles # 2.765 GHz - 12,581,849,902 instructions # 2.06 insn per cycle - 2.212402360 seconds time elapsed +TOTAL : 2.254459 sec + 6,084,381,375 cycles # 2.693 GHz + 12,610,002,661 instructions # 2.07 insn per cycle + 2.260263260 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.516180e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.984165e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.984165e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.141713e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.554289e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.554289e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.007710 sec -INFO: No Floating Point Exceptions have been reported - 5,576,628,773 cycles # 2.771 GHz - 12,020,299,868 instructions # 2.16 insn per cycle - 2.013581558 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.147865 sec + 5,852,500,330 cycles # 2.720 GHz + 12,186,332,321 instructions # 2.08 insn per cycle + 2.153550767 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.502286e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.687963e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.687963e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.413552e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.588205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.588205e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.102138 sec -INFO: No Floating Point Exceptions have been reported - 5,751,986,200 cycles # 1.852 GHz - 8,297,969,466 instructions # 1.44 insn per cycle - 3.107697215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.180124 sec + 5,723,407,148 cycles # 1.797 GHz + 8,277,947,646 instructions # 1.45 insn per cycle + 3.185775207 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..4b28e0c827 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:49:10 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.755096e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.215389e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.607884e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.539292 sec + 2,216,200,050 cycles # 2.846 GHz + 3,157,615,309 instructions # 1.42 insn per cycle + 0.835257331 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.787183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.832888e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.832888e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.975964 sec + 17,260,345,803 cycles # 2.886 GHz + 46,320,336,029 instructions # 2.68 insn per cycle + 5.981639118 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.111247e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.265577e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265577e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.479269 sec + 10,044,184,434 cycles # 2.883 GHz + 27,919,122,564 instructions # 2.78 insn per cycle + 3.485095741 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.905590e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.283676e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.283676e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.245986 sec + 6,089,248,282 cycles # 2.705 GHz + 12,609,705,263 instructions # 2.07 insn per cycle + 2.251881277 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.148141e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.559740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.559740e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.144804 sec + 5,824,946,914 cycles # 2.710 GHz + 12,184,657,847 instructions # 2.09 insn per cycle + 2.150527846 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.423895e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.599460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.599460e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.171890 sec + 5,741,396,850 cycles # 1.808 GHz + 8,278,034,433 instructions # 1.44 insn per cycle + 3.177718293 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 70a45db399..e5e06f1218 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,235 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:44:10 +DATE: 2025-10-11_16:37:03 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.785807e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.291280e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.973584e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.626435e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.214094e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.587498e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.717756 sec -INFO: No Floating Point Exceptions have been reported - 2,755,914,027 cycles # 2.900 GHz - 4,368,405,962 instructions # 1.59 insn per cycle - 1.007006361 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 0.726364 sec + 2,849,514,717 cycles # 2.845 GHz + 4,382,574,758 instructions # 1.54 insn per cycle + 1.057928884 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.829948e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.877608e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.877608e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.789888e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.835303e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.835303e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.839744 sec -INFO: No Floating Point Exceptions have been reported - 17,231,514,699 cycles # 2.948 GHz - 45,931,758,909 instructions # 2.67 insn per cycle - 5.845651027 seconds time elapsed +TOTAL : 5.967334 sec + 17,272,703,409 cycles # 2.893 GHz + 46,321,862,531 instructions # 2.68 insn per cycle + 5.973038452 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.215717e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.376174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.376174e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.088498e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.238712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.238712e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.370523 sec -INFO: No Floating Point Exceptions have been reported - 9,939,666,586 cycles # 2.945 GHz - 27,847,302,489 instructions # 2.80 insn per cycle - 3.376515027 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.504822 sec + 10,065,494,953 cycles # 2.868 GHz + 27,919,546,717 instructions # 2.77 insn per cycle + 3.510554362 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.058902e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.451650e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.451650e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.895401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.272281e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.272281e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.181386 sec -INFO: No Floating Point Exceptions have been reported - 6,074,037,919 cycles # 2.778 GHz - 12,580,567,087 instructions # 2.07 insn per cycle - 2.187203017 seconds time elapsed +TOTAL : 2.251790 sec + 6,086,448,139 cycles # 2.697 GHz + 12,610,253,243 instructions # 2.07 insn per cycle + 2.257658692 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.484469e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.947491e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.947491e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.104544e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.508827e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.508827e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.020942 sec -INFO: No Floating Point Exceptions have been reported - 5,589,694,694 cycles # 2.759 GHz - 12,020,772,424 instructions # 2.15 insn per cycle - 2.026934215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.163370 sec + 5,848,310,473 cycles # 2.697 GHz + 12,186,147,335 instructions # 2.08 insn per cycle + 2.169166916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.541083e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.728456e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.728456e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.395329e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.569447e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.569447e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.072814 sec -INFO: No Floating Point Exceptions have been reported - 5,724,538,871 cycles # 1.860 GHz - 8,297,304,281 instructions # 1.45 insn per cycle - 3.079169559 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.198349 sec + 5,734,393,208 cycles # 1.791 GHz + 8,277,908,197 instructions # 1.44 insn per cycle + 3.204254400 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 03be4a726d..09986e5034 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:58:15 +DATE: 2025-10-11_15:17:41 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.508928e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.321752e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.002344e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.740251e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.070566e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.446622e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.536365 sec -INFO: No Floating Point Exceptions have been reported - 2,214,194,265 cycles # 2.876 GHz - 3,152,115,430 instructions # 1.42 insn per cycle - 0.834564895 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.542467 sec + 2,308,061,310 cycles # 2.843 GHz + 3,180,365,192 instructions # 1.38 insn per cycle + 0.870299018 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.855453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.904405e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904405e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.832732e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.880113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.880113e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.800560 sec -INFO: No Floating Point Exceptions have been reported - 16,903,949,090 cycles # 2.909 GHz - 45,043,853,273 instructions # 2.66 insn per cycle - 5.813534817 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.829901 sec + 16,848,535,293 cycles # 2.888 GHz + 45,296,509,977 instructions # 2.69 insn per cycle + 5.835776505 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.339712e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518637e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.518637e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.271423e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.440008e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.440008e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.288467 sec -INFO: No Floating Point Exceptions have been reported - 9,645,043,566 cycles # 2.925 GHz - 26,807,862,552 instructions # 2.78 insn per cycle - 3.301069690 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2327) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.314065 sec + 9,572,123,137 cycles # 2.885 GHz + 26,751,815,901 instructions # 2.79 insn per cycle + 3.319563861 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.590385e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
4.923511e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.923511e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.514184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.827414e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.827414e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.431911 sec -INFO: No Floating Point Exceptions have been reported - 6,762,097,168 cycles # 2.769 GHz - 14,239,182,198 instructions # 2.11 insn per cycle - 2.443454156 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) +TOTAL : 2.431404 sec + 6,623,808,841 cycles # 2.719 GHz + 14,177,690,165 instructions # 2.14 insn per cycle + 2.437208264 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2724) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.784038e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.137564e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.137564e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.701345e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.040507e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.040507e+05 ) sec^-1 
MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.339078 sec -INFO: No Floating Point Exceptions have been reported - 6,493,835,738 cycles # 2.765 GHz - 13,835,177,964 instructions # 2.13 insn per cycle - 2.350490634 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 298) (512z: 0) +TOTAL : 2.338470 sec + 6,401,665,095 cycles # 2.732 GHz + 13,769,940,318 instructions # 2.15 insn per cycle + 2.344318448 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2371) (512y: 297) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.400894e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.576119e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.576119e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.303189e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.466084e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.466084e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.231977 sec -INFO: No Floating Point Exceptions have been reported - 6,054,126,925 cycles # 1.868 GHz - 10,181,313,288 instructions # 1.68 insn per cycle - 3.245420113 seconds 
time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1273) (512y: 208) (512z: 1988) +TOTAL : 3.283375 sec + 5,957,178,129 cycles # 1.812 GHz + 10,086,124,192 instructions # 1.69 insn per cycle + 3.289028880 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1276) (512y: 208) (512z: 1988) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index f94c1448dd..0d42001848 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:28:32 +DATE: 2025-10-11_16:18:17 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.445619e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.389644e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.998797e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.785771e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.171465e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.568632e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.532166 sec -INFO: No Floating Point Exceptions have been reported - 2,223,705,741 cycles # 2.888 GHz - 3,137,862,648 instructions # 1.41 insn per cycle - 0.826622030 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.539437 sec + 2,324,660,140 cycles # 2.833 GHz + 3,221,828,743 instructions # 1.39 insn per cycle + 0.878217469 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.243473e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.316891e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.316891e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.387107e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.469288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.469288e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.822701 sec -INFO: No Floating Point Exceptions have been reported - 14,262,425,677 cycles # 2.951 GHz - 34,462,229,045 instructions # 2.42 insn per cycle - 4.834685593 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 665) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.501541 sec + 13,071,399,497 cycles # 2.901 GHz + 34,739,078,110 instructions # 2.66 insn per cycle + 4.507191858 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 648) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.991823e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.134338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.134338e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.901021e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.033616e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.033616e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.653458 sec -INFO: No Floating Point Exceptions have been reported - 10,828,452,798 cycles # 2.955 GHz - 24,364,594,695 instructions # 2.25 insn per cycle - 3.665357624 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2610) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.723435 sec + 10,832,687,449 cycles # 2.906 GHz + 24,282,426,073 instructions # 2.24 insn per cycle + 3.728894903 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2579) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515654 -Relative difference = 3.2588039900609506e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.588361e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.923011e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.923011e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.388729e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.690145e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.690145e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.432860 sec -INFO: No Floating Point Exceptions have been reported - 6,763,126,248 cycles # 2.768 GHz - 12,520,790,366 instructions # 1.85 insn per cycle - 2.444836798 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3115) (512y: 0) (512z: 0) +TOTAL : 2.497295 sec + 6,743,813,449 cycles # 2.696 GHz + 12,543,269,382 instructions # 1.86 insn per cycle + 2.502704497 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3156) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.983949e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.371900e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.371900e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.651146e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.006867e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.006867e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.251146 sec -INFO: No Floating Point Exceptions have been reported - 6,291,656,449 cycles # 2.782 GHz - 11,662,894,163 instructions # 1.85 insn per cycle - 2.263135736 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2644) (512y: 239) (512z: 0) +TOTAL : 2.362181 sec + 6,370,126,838 cycles # 2.692 GHz + 11,708,850,355 instructions # 1.84 insn per cycle + 2.367368593 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2674) (512y: 239) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.728872e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.941749e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.941749e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.672883e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.874095e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.874095e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.960781 sec -INFO: No Floating Point Exceptions have been reported - 5,563,913,804 cycles # 1.872 GHz - 9,412,295,126 instructions # 1.69 insn per cycle - 2.972906161 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2099) (512y: 282) (512z: 1958) +TOTAL : 2.962382 sec + 5,387,973,040 cycles # 1.816 GHz + 9,344,687,874 instructions # 1.73 insn per cycle + 2.967757912 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2107) (512y: 282) (512z: 1954) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 3c1647789f..1f895c929f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:28:57 +DATE: 2025-10-11_16:18:48 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.391002e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.323919e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.976474e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.773620e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.074692e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.456461e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.534058 sec -INFO: No Floating Point Exceptions have been reported - 2,225,875,951 cycles # 2.883 GHz - 3,143,824,990 instructions # 1.41 insn per cycle - 0.828954123 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.534811 sec + 2,266,123,133 cycles # 2.828 GHz + 3,168,944,538 instructions # 1.40 insn per cycle + 0.857996121 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.586147e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.682611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.682611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.506524e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.597769e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.597769e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.200138 sec -INFO: No Floating Point Exceptions have been reported - 12,457,576,414 cycles # 2.958 GHz - 35,030,140,380 instructions # 2.81 insn per cycle - 4.211834896 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 430) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.291386 sec + 12,399,672,738 cycles # 2.887 GHz + 35,290,415,137 instructions # 2.85 insn per cycle + 4.296907910 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.003695e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.145378e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.145378e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.891328e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.022776e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.022776e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.637171 sec -INFO: No Floating Point Exceptions have been reported - 10,771,658,335 cycles # 2.953 GHz - 23,459,809,146 instructions # 2.18 insn per cycle - 3.648522280 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2378) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.735496 sec + 10,767,908,972 cycles # 2.879 GHz + 23,493,099,341 instructions # 2.18 insn per cycle + 3.741023923 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515654 -Relative difference = 3.2588039900609506e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.029039e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.423785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.423785e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.929407e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.312189e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.312189e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.231082 sec -INFO: No Floating Point Exceptions have been reported - 6,224,358,348 cycles # 2.777 GHz - 11,980,138,777 instructions # 1.92 insn per cycle - 2.242426635 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2468) (512y: 0) (512z: 0) +TOTAL : 2.235559 sec + 6,081,264,505 cycles # 2.715 GHz + 12,002,246,039 instructions # 1.97 insn per cycle + 2.240973571 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.044695e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.439218e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.439218e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.860705e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.225389e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.225389e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.224952 sec -INFO: No Floating Point Exceptions have been reported - 6,216,689,838 cycles # 2.781 GHz - 11,219,235,507 instructions # 1.80 insn per cycle - 2.236216110 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2098) (512y: 174) (512z: 0) +TOTAL : 2.264729 sec + 6,145,018,402 cycles # 2.708 GHz + 11,235,762,297 instructions # 1.83 insn per cycle + 2.270329967 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2110) (512y: 174) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.888626e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.118349e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.118349e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.696752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.901055e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.901055e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.842958 sec -INFO: No Floating Point Exceptions have been reported - 5,376,391,405 cycles # 1.885 GHz - 9,136,626,879 instructions # 1.70 insn per cycle - 2.854254782 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1632) (512y: 208) (512z: 1567) +TOTAL : 2.944494 sec + 5,239,165,595 cycles # 1.777 GHz + 9,095,766,728 instructions # 1.74 insn per cycle + 2.949694561 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1638) (512y: 208) (512z: 1583) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..70eb313ac9 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:41:21 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.475062e+06 1 256 +3.218486e+06 2 256 +5.903821e+06 4 256 +1.165716e+07 8 256 +2.454885e+07 16 256 +4.527393e+07 32 256 +8.391766e+07 64 256 +1.334550e+08 128 256 +1.552485e+08 256 256 +1.694983e+08 512 256 +1.849571e+08 1024 256 +### GPU: scaling test 32 +1.882231e+05 1 32 +4.016921e+05 2 32 +8.022815e+05 4 32 +1.595811e+06 8 32 +3.056260e+06 16 32 +6.326142e+06 32 32 +1.208794e+07 64 32 +2.463478e+07 128 32 +4.741756e+07 256 32 +9.093281e+07 512 32 +1.150905e+08 1024 32 +1.344888e+08 2048 32 +1.543860e+08 4096 32 +1.683918e+08 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.843216e+05 1 256 +1.897524e+05 2 256 +1.896027e+05 4 256 +### CPU: scaling test 32 +1.666589e+05 1 32 +1.669510e+05 2 32 +1.791277e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.321762e+05 1 256 +4.399797e+05 2 256 +4.577304e+05 4 256 +### CPU: scaling test 32 +4.375351e+05 1 32 +3.779245e+05 2 32 +4.181545e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.280541e+05 1 256 +9.070263e+05 2 256 +9.020254e+05 4 256 +### CPU: scaling test 32 +8.873360e+05 1 32 +9.140769e+05 2 32 +9.224693e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.444090e+05 1 256 +9.480587e+05 2 256 +9.506189e+05 4 256 +### CPU: scaling test 32 +9.250159e+05 1 32 +9.436188e+05 2 32 +9.553023e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +6.540106e+05 1 256 +6.620410e+05 2 256 +6.781399e+05 4 256 +### CPU: scaling test 32 +5.655809e+05 1 32 +5.425522e+05 2 32 +6.546076e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index eed598e900..29a4ea8877 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:59:31 +DATE: 2025-10-11_15:19:12 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.348925e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.730429e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.847126e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.227728e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.785385e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.924249e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.489368 sec -INFO: No Floating Point Exceptions have been reported - 2,066,464,716 cycles # 2.888 GHz - 2,966,218,976 instructions # 1.44 insn per cycle - 0.775358949 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.492304 sec + 2,118,504,146 cycles # 2.819 GHz + 2,963,870,047 instructions # 1.40 insn per cycle + 0.808747497 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.920704e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.976809e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.976809e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.880677e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.933319e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.933319e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.564608 sec -INFO: No Floating Point Exceptions have been reported - 16,407,008,301 cycles # 2.946 GHz - 45,390,324,197 instructions # 2.77 insn per cycle - 5.572247633 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.662756 sec + 16,361,560,744 cycles # 2.887 GHz + 45,526,236,392 instructions # 2.78 insn per cycle + 5.668346367 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.527362e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.867119e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.867119e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.414646e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.739659e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.739659e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.418751 sec -INFO: No Floating Point Exceptions have been reported - 
7,148,582,676 cycles # 2.947 GHz - 17,841,430,692 instructions # 2.50 insn per cycle - 2.426747092 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.463879 sec + 7,092,934,877 cycles # 2.874 GHz + 17,852,493,922 instructions # 2.52 insn per cycle + 2.469325378 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.351940e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.517580e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.517580e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.208525e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.313027e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.313027e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.358362 sec -INFO: No Floating Point Exceptions have been reported - 3,812,563,399 cycles # 2.792 GHz - 8,312,155,726 instructions # 2.18 insn per cycle - 1.366469053 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.365011 sec + 3,747,283,623 cycles # 2.735 GHz + 
8,291,354,119 instructions # 2.21 insn per cycle + 1.370608034 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.799220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.010674e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.010674e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.454543e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.612605e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.612605e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.295301 sec -INFO: No Floating Point Exceptions have been reported - 3,622,174,398 cycles # 2.781 GHz - 7,961,498,247 instructions # 2.20 insn per cycle - 1.303182368 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.327433 sec + 3,648,803,599 cycles # 2.739 GHz + 8,020,246,707 instructions # 2.20 insn per cycle + 1.332943592 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.500324e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.161825e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.161825e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.298741e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.918817e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.918817e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.717843 sec -INFO: No Floating Point Exceptions have been reported - 3,332,199,340 cycles # 1.933 GHz - 6,146,454,565 instructions # 1.84 insn per cycle - 1.725889754 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.753154 sec + 3,282,016,345 cycles # 1.867 GHz + 6,088,962,733 instructions # 1.86 insn per cycle + 1.758605907 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW 
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183148950338 Relative difference = 1.5521108056421764e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..d76cec9169 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:56:13 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.541979e+05 1 256 +9.203949e+05 2 256 +1.645855e+06 4 256 +3.099419e+06 8 256 +4.823113e+06 16 256 +7.898172e+06 32 256 +1.061455e+07 64 256 +1.233940e+07 128 256 +1.359197e+07 256 256 +1.426011e+07 512 256 +1.471228e+07 1024 256 +### GPU: scaling test 32 +5.695876e+04 1 32 +1.092163e+05 2 32 +2.189134e+05 4 32 +4.543656e+05 8 32 +8.666538e+05 16 32 +1.664792e+06 32 32 +3.023066e+06 64 32 +5.156183e+06 128 32 +7.621691e+06 256 32 +1.049897e+07 512 32 +1.232012e+07 1024 32 +1.355710e+07 2048 32 +1.432425e+07 4096 32 +1.475276e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.747944e+05 1 256 +1.817829e+05 2 256 +1.896771e+05 4 256 +### CPU: scaling test 32 +1.728805e+05 1 32 +1.767946e+05 2 32 +1.762418e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.997246e+05 1 256 +4.307310e+05 2 256 +4.464263e+05 4 256 +### CPU: scaling test 32 +3.999600e+05 1 32 +3.699679e+05 2 32 +4.315766e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.797794e+05 1 256 +8.305580e+05 2 256 +8.419045e+05 4 256 +### CPU: scaling test 32 +8.881488e+05 1 32 +9.130727e+05 2 32 +9.232345e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.581879e+05 1 256 +9.512415e+05 2 256 +9.501003e+05 4 256 +### CPU: scaling test 32 +9.220574e+05 1 32 +9.420354e+05 2 32 +8.881180e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +6.495302e+05 1 256 +6.782481e+05 2 256 +6.868630e+05 4 256 +### CPU: scaling test 32 +5.595188e+05 1 32 +6.234779e+05 2 32 +6.548319e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..e92eb3813b --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:51:48 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.351930e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.489593e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.498993e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 1.246737 sec + 4,579,068,239 cycles # 2.831 GHz + 6,336,239,576 instructions # 1.38 insn per cycle + 1.674994938 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499532034621 +Relative difference = 1.920001590188648e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.876691e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.929278e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.929278e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 5.673971 sec + 16,357,814,340 cycles # 2.881 GHz + 45,526,139,472 instructions # 2.78 insn per cycle + 5.679332523 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.428670e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.753669e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.753669e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.455440 sec + 7,090,910,684 cycles # 2.883 GHz + 17,852,546,600 instructions # 2.52 insn per cycle + 2.460806632 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.063338e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.125894e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.125894e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.386534 sec + 3,756,179,949 cycles # 2.700 GHz + 8,291,185,200 instructions # 2.21 insn per cycle + 1.391900760 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.396585e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.545366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.545366e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.336868 sec + 3,642,317,678 cycles # 2.716 GHz + 8,019,205,916 instructions # 2.20 insn per cycle + 1.344058514 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.310834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.934764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.934764e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.748608 sec + 3,284,552,833 cycles # 1.874 GHz + 6,088,622,803 instructions # 1.85 insn per cycle + 1.753990283 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index ba391daf9b..3e1eb5adfb 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,252 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:38:02 +DATE: 2025-10-11_16:29:11 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.962971e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.366502e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.366502e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.961069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.550509e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.550509e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.683449 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,639,955,466 cycles # 2.881 GHz - 4,089,465,491 instructions # 1.55 insn per cycle - 0.973820402 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 0.685895 sec + 2,724,461,027 cycles # 2.849 GHz + 4,115,491,673 instructions # 1.51 insn per cycle + 1.013379386 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.921187e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.975107e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.975107e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.879765e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.932625e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932625e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.590872 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 16,505,827,538 cycles # 2.949 GHz - 45,383,324,587 instructions # 2.75 insn per cycle - 5.597525299 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.709270 sec + 16,545,315,698 cycles # 2.895 GHz + 45,565,469,143 instructions # 2.75 insn per cycle + 5.715931822 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.503675e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.835801e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.835801e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.377287e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.696132e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.696132e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.463825 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 7,301,370,898 cycles # 2.956 GHz - 18,072,803,019 instructions # 2.48 insn per cycle - 2.471007950 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.532029 sec + 7,290,698,661 cycles # 2.873 GHz + 18,128,482,182 instructions # 2.49 insn per cycle + 2.538964767 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.228346e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.356902e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.356902e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.010327e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.072284e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.072284e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.409585 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,950,274,134 cycles # 2.790 GHz - 8,500,615,795 instructions # 2.15 insn per cycle - 1.416669722 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.445098 sec + 3,968,422,684 cycles # 2.734 GHz + 8,524,408,845 instructions # 2.15 insn per cycle + 1.452187655 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.630316e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.908478e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.908478e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.285117e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.425187e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.425187e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.350838 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,776,468,219 cycles # 2.783 GHz - 8,150,432,975 instructions # 2.16 insn per cycle - 1.357973048 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.403001 sec + 3,860,651,396 cycles # 2.740 GHz + 8,252,993,133 instructions # 2.14 insn per cycle + 1.409829697 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.446924e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.088794e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.088794e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.256834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.869079e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.869079e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.766906 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,483,580,907 cycles # 1.964 GHz - 6,352,443,418 instructions # 1.82 insn per cycle - 1.774118995 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.813530 sec + 3,488,089,376 cycles # 1.917 GHz + 6,339,016,347 instructions # 1.82 insn per cycle + 1.820470769 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183148950338 Relative difference = 1.5521108056421764e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index eaf1557b5a..001fd1b5e8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:50:12 +DATE: 2025-10-11_16:44:30 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.125576e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.707303e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.828418e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.384623e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.781787e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923075e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.579716 sec -INFO: No Floating Point Exceptions have been reported - 2,336,853,883 cycles # 2.860 GHz - 3,355,823,518 instructions # 1.44 insn per cycle - 0.873538557 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.586690 sec + 2,388,718,169 cycles # 2.838 GHz + 3,423,003,931 instructions # 1.43 insn per cycle + 0.899326702 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.929027e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.983438e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.983438e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.880714e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934194e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934194e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 5.578220 sec -INFO: No Floating Point Exceptions have been reported - 16,412,792,219 cycles # 2.940 GHz - 45,364,108,775 instructions # 2.76 insn per cycle - 5.583854256 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.720004 sec + 16,536,660,388 cycles # 2.889 GHz + 45,556,960,525 instructions # 2.75 insn per cycle + 5.725324950 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.528116e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.863028e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.863028e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.433465e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.759989e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.759989e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.458830 sec -INFO: No Floating Point Exceptions have been reported - 7,256,357,914 cycles # 2.945 GHz - 17,803,442,746 instructions # 2.45 insn per cycle - 2.464565338 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.509292 sec + 7,256,957,374 cycles # 2.887 GHz + 17,864,987,256 instructions # 2.46 insn per cycle + 2.514536012 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.321630e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 9.466483e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.466483e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.020309e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.092138e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.092138e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.402951 sec -INFO: No Floating Point Exceptions have been reported - 3,915,341,003 cycles # 2.781 GHz - 8,245,891,296 instructions # 2.11 insn per cycle - 1.408611815 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.453461 sec + 3,918,315,703 cycles # 2.689 GHz + 8,275,994,533 instructions # 2.11 insn per cycle + 1.458689528 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.769699e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.005525e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.005525e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.428992e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.604343e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 9.604343e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.339268 sec -INFO: No Floating Point Exceptions have been reported - 3,730,447,512 cycles # 2.775 GHz - 7,861,984,465 instructions # 2.11 insn per cycle - 1.344998375 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.389726 sec + 3,813,398,977 cycles # 2.735 GHz + 7,970,393,641 instructions # 2.09 insn per cycle + 1.395086187 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.517692e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.188107e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.188107e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.306240e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.928204e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.928204e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.753383 sec -INFO: No Floating Point Exceptions have been reported - 3,445,483,739 cycles # 1.959 GHz - 
6,046,658,237 instructions # 1.75 insn per cycle - 1.759146158 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.809723 sec + 3,457,472,821 cycles # 1.906 GHz + 6,039,803,289 instructions # 1.75 insn per cycle + 1.815214301 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183148950338 Relative difference = 1.5521108056421764e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 0132142a7f..d6dd5599d5 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:47:21 +DATE: 2025-10-11_16:40:59 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.231900e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.718618e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.843172e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.173088e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.784679e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922376e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.520956 sec -INFO: No Floating Point Exceptions have been reported - 2,145,908,279 cycles # 2.880 GHz - 3,342,720,192 instructions # 1.56 insn per cycle - 0.802555619 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.528664 sec + 2,228,192,580 cycles # 2.835 GHz + 3,376,529,061 instructions # 1.52 insn per cycle + 0.842332325 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.929666e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.983661e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.983661e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.871432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.923569e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923569e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.518695 sec -INFO: No Floating Point Exceptions have been reported - 16,237,309,072 cycles # 2.940 GHz - 45,332,194,999 instructions # 2.79 insn per cycle - 5.524338903 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.691172 sec + 16,369,213,744 cycles # 2.874 GHz + 45,526,750,504 instructions # 2.78 insn per cycle + 5.696402221 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.531812e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.871745e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.871745e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.441693e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.769480e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.769480e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.400101 sec -INFO: No Floating Point Exceptions have been reported - 7,092,917,063 cycles # 2.949 GHz - 17,790,950,300 instructions # 2.51 insn per cycle - 2.405895056 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.448890 sec + 7,093,051,214 cycles # 2.891 GHz + 17,852,960,067 instructions # 2.52 insn per cycle + 2.454461827 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.364764e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.520513e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.520513e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.163467e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.249025e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.249025e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.339869 sec -INFO: No Floating Point Exceptions have been reported - 3,746,789,760 cycles # 2.786 GHz - 8,261,610,745 instructions # 2.20 insn per cycle - 1.345882215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.371747 sec + 3,753,987,891 cycles # 2.728 GHz + 8,291,362,993 instructions # 2.21 insn per cycle + 1.377043835 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.818621e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.013746e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.013746e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.404785e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.570601e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.570601e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.275913 sec -INFO: No Floating Point Exceptions have been reported - 3,561,649,230 cycles # 2.781 GHz - 7,911,264,889 instructions # 2.22 insn per cycle - 1.281614236 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.335938 sec + 3,649,997,495 cycles # 2.722 GHz + 8,019,382,433 instructions # 2.20 insn per cycle + 1.341456805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.490214e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.139560e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.139560e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.228574e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.840288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.840288e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.701983 sec -INFO: No Floating Point Exceptions have been reported - 3,270,370,699 cycles # 1.916 GHz - 6,096,029,839 instructions # 1.86 insn per cycle - 1.707817189 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.772330 sec + 3,277,054,131 cycles # 1.844 GHz + 6,089,082,639 instructions # 1.86 insn per cycle + 1.777760056 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183148950338 Relative difference = 1.5521108056421764e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..0ad3efbc84 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:50:09 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.507701e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.798145e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925897e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.495248 sec + 2,073,360,534 cycles # 2.817 GHz + 2,919,069,837 instructions # 1.41 insn per cycle + 0.794188547 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.871656e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.924156e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.924156e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 5.690466 sec + 16,392,687,892 cycles # 2.879 GHz + 45,529,529,055 instructions # 2.78 insn per cycle + 5.695668537 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.439601e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.767131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.767131e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.449797 sec + 7,091,941,326 cycles # 2.890 GHz + 17,852,858,856 instructions # 2.52 insn per cycle + 2.455296966 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.145431e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.245108e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.245108e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.374709 sec + 3,766,055,040 cycles # 2.731 GHz + 8,291,749,848 instructions # 2.20 insn per cycle + 1.380351643 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.422664e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.588896e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.588896e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.332190 sec + 3,646,916,248 cycles # 2.728 GHz + 8,019,155,847 instructions # 2.20 insn per cycle + 1.337783089 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.310342e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.933915e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.933915e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.749833 sec + 3,289,282,662 cycles # 1.875 GHz + 6,089,226,401 instructions # 1.85 insn per cycle + 1.755424623 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 55c92f68ec..0d4e6e9f4e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,235 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:44:35 +DATE: 2025-10-11_16:37:35 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.418560e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.722658e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.839243e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.371325e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.785294e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923320e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.630221 sec -INFO: No Floating Point Exceptions have been reported - 2,475,236,721 cycles # 2.897 GHz - 3,823,734,565 instructions # 1.54 insn per cycle - 0.911361538 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 0.635131 sec + 2,535,737,467 cycles # 2.824 GHz + 3,842,575,439 instructions # 1.52 insn per cycle + 0.954476643 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.933112e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.987540e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.987540e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.876671e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.930263e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.930263e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.510619 sec -INFO: No Floating Point Exceptions have been reported - 16,239,692,933 cycles # 2.945 GHz - 45,332,021,728 instructions # 2.79 insn per cycle - 5.516250908 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.674874 sec + 16,371,341,972 cycles # 2.883 GHz + 45,526,097,275 instructions # 2.78 insn per cycle + 5.680145436 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.528380e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.868469e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.868469e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.409852e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.733764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.733764e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.402631 sec -INFO: No Floating Point Exceptions have been reported - 7,087,618,340 cycles # 2.944 GHz - 17,790,727,043 instructions # 2.51 insn per cycle - 2.408346877 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.465466 sec + 7,089,429,077 cycles # 2.870 GHz + 17,852,779,482 instructions # 2.52 insn per cycle + 2.470998970 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.367783e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.536121e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.536121e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.159709e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.263116e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.263116e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.339197 sec -INFO: No Floating Point Exceptions have been reported - 3,748,433,186 cycles # 2.789 GHz - 8,262,218,774 instructions # 2.20 insn per cycle - 1.344812605 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.372303 sec + 3,755,689,027 cycles # 2.728 GHz + 8,291,380,091 instructions # 2.21 insn per cycle + 1.377787541 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.816225e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.011910e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.011910e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.407094e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.566877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.566877e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.274973 sec -INFO: No Floating Point Exceptions have been reported - 3,561,414,995 cycles # 2.782 GHz - 7,912,015,045 instructions # 2.22 insn per cycle - 1.280637958 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.334826 sec + 3,652,466,006 cycles # 2.727 GHz + 8,020,599,017 instructions # 2.20 insn per cycle + 1.340268045 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.504790e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.157762e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.157762e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.261859e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.880005e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.880005e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.699252 sec -INFO: No Floating Point Exceptions have been reported - 3,270,672,138 cycles # 1.919 GHz - 6,095,863,693 instructions # 1.86 insn per cycle - 1.704973507 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.763075 sec + 3,282,506,046 cycles # 1.857 GHz + 6,088,973,421 instructions # 1.85 insn per cycle + 1.768455658 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183148950338 Relative difference = 1.5521108056421764e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 5e80ecf473..e0e7f701d0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:59:52 +DATE: 2025-10-11_15:19:36 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.326131e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.746336e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.856838e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.162146e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.783523e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914919e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.493394 sec -INFO: No Floating Point Exceptions have been reported - 2,062,281,894 cycles # 2.861 GHz - 2,938,913,241 instructions # 1.43 insn per cycle - 0.784913836 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 126 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.491426 sec + 2,125,746,364 cycles # 2.830 GHz + 2,979,109,571 instructions # 1.40 insn per cycle + 0.808584273 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.953822e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.011638e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.011638e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.921360e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.976251e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.976251e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.471139 sec -INFO: No Floating Point Exceptions have been reported - 16,020,529,034 cycles # 2.925 GHz - 44,492,038,074 instructions # 2.78 insn per cycle - 5.480388445 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.544826 sec + 16,047,528,517 cycles # 2.892 GHz + 44,602,173,132 instructions # 2.78 insn per cycle + 5.550245916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 537) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.317220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.788673e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.788673e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.214945e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.668104e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.668104e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.075008 sec -INFO: No Floating Point Exceptions have been reported - 6,135,177,420 cycles # 2.947 GHz - 17,131,917,948 instructions # 2.79 insn per cycle - 2.082995277 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2863) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.098377 sec + 6,110,919,161 cycles # 2.906 GHz + 17,150,206,958 instructions # 2.81 insn per cycle + 2.103751937 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2861) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.077036e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
6.672972e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.672972e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.851382e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.388872e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.388872e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.827961 sec -INFO: No Floating Point Exceptions have been reported - 5,098,745,585 cycles # 2.778 GHz - 10,277,927,063 instructions # 2.02 insn per cycle - 1.836088116 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3907) (512y: 0) (512z: 0) +TOTAL : 1.879565 sec + 5,032,467,533 cycles # 2.672 GHz + 10,256,120,490 instructions # 2.04 insn per cycle + 1.885016732 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3911) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.138089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.753320e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.753320e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.035975e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.607599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.607599e+05 ) sec^-1 
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.811229 sec -INFO: No Floating Point Exceptions have been reported - 5,047,478,028 cycles # 2.778 GHz - 10,048,355,032 instructions # 1.99 insn per cycle - 1.819572790 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3806) (512y: 2) (512z: 0) +TOTAL : 1.824491 sec + 4,977,961,454 cycles # 2.721 GHz + 10,027,255,295 instructions # 2.01 insn per cycle + 1.830117525 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3808) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.690006e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.022722e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.022722e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.496582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.807885e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.807885e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.339710 sec -INFO: No Floating Point Exceptions have been reported - 4,430,484,038 cycles # 1.888 GHz - 8,494,687,635 instructions # 1.92 insn per cycle - 2.347901015 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2746) (512y: 4) (512z: 2754) +TOTAL : 2.420813 sec + 4,388,139,749 cycles # 1.809 GHz + 8,457,918,888 instructions # 1.93 insn per cycle + 2.426523884 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 4) (512z: 2749) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183148950338 Relative difference = 1.5521108056421764e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 8666f655aa..f0b80e260e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:29:20 +DATE: 2025-10-11_16:19:19 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.502979e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.757241e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.878370e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.131628e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.790004e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927316e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.488684 sec -INFO: No Floating Point Exceptions have been reported - 2,072,092,086 cycles # 2.888 GHz - 2,980,809,123 instructions # 1.44 insn per cycle - 0.774128701 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.492105 sec + 2,126,004,887 cycles # 2.830 GHz + 2,972,871,951 instructions # 1.40 insn per cycle + 0.808125336 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.497944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.591831e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.591831e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.361435e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.444812e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.444812e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.301034 sec -INFO: No Floating Point Exceptions have been reported - 12,652,758,977 cycles # 2.937 GHz - 34,660,886,060 instructions # 2.74 insn per cycle - 4.309086604 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 683) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.526570 sec + 12,786,889,749 cycles # 2.822 GHz + 34,767,168,341 instructions # 2.72 insn per cycle + 4.531843724 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 649) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199094356969 -Relative difference = 4.463890496342449e-08 +Avg ME (F77/C++) = 2.0288198597263545 +Relative difference = 6.914050807267083e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.170038e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.622090e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.622090e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.142214e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.587894e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.587894e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.133248 sec -INFO: No Floating Point Exceptions have been reported - 6,307,478,134 cycles # 2.947 GHz - 14,873,781,997 instructions # 2.36 insn per cycle - 2.140857047 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2975) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.126971 sec + 6,176,687,935 cycles # 2.898 GHz + 14,909,588,070 instructions # 2.41 insn per cycle + 2.132251600 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2978) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288193755550310
-Relative difference = 1.8511017053446366e-07
+Avg ME (F77/C++) = 2.0288193110609427
+Relative difference = 1.5332118970762702e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.248492e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.104502e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.104502e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.053580e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.852260e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.852260e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.548692 sec
-INFO: No Floating Point Exceptions have been reported
- 4,331,332,767 cycles # 2.784 GHz
- 9,119,017,787 instructions # 2.11 insn per cycle
- 1.556682967 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4456) (512y: 0) (512z: 0)
+TOTAL : 1.573119 sec
+ 4,286,494,919 cycles # 2.717 GHz
+ 9,134,727,561 instructions # 2.13 insn per cycle
+ 1.578532938 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4466) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182069780305 -Relative difference = 1.0201902325125583e-07 +Avg ME (F77/C++) = 2.0288181575015187 +Relative difference = 7.763215770863579e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.353371e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.251881e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.251881e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.155196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.974374e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.974374e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.529166 sec -INFO: No Floating Point Exceptions have been reported - 4,288,032,705 cycles # 2.791 GHz - 8,709,611,506 instructions # 2.03 insn per cycle - 1.537124060 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4233) (512y: 0) (512z: 0) +TOTAL : 1.552673 sec + 4,257,884,690 cycles # 2.734 GHz + 8,700,271,049 instructions # 2.04 insn per cycle + 1.558196136 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4224) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182069780305 -Relative difference = 1.0201902325125583e-07 +Avg ME (F77/C++) = 2.0288181575015187 +Relative difference = 7.763215770863579e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.411255e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.862053e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.862053e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.246960e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.671205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.671205e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.041395 sec -INFO: No Floating Point Exceptions have been reported - 3,904,121,018 cycles # 1.906 GHz - 7,856,412,999 instructions # 2.01 insn per cycle - 2.049301951 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4273) (512y: 0) (512z: 2558) +TOTAL : 2.085797 sec + 3,847,204,769 cycles # 1.841 GHz + 7,838,410,301 instructions # 2.04 insn per cycle + 2.091150296 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4276) (512y: 0) (512z: 2561) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183246739209 -Relative difference = 1.6003107281264138e-07 +Avg ME (F77/C++) = 2.0288182856747881 +Relative difference = 1.4080848467904676e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 74b1cf75ec..26b7d791d0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:29:40 +DATE: 2025-10-11_16:19:42 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.573239e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.755917e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.881516e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.156027e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795194e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935274e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.487451 sec -INFO: No Floating Point Exceptions have been reported - 2,067,657,057 cycles # 2.894 GHz - 2,969,147,079 instructions # 1.44 insn per cycle - 0.771604792 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 126 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.491299 sec + 2,134,224,720 cycles # 2.818 GHz + 2,993,931,932 instructions # 1.40 insn per cycle + 0.814346515 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.674902e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.781976e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.781976e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.565640e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.664688e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.664688e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.022349 sec -INFO: No Floating Point Exceptions have been reported - 11,884,847,246 cycles # 2.950 GHz - 35,128,022,846 instructions # 2.96 insn per cycle - 4.030241157 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 453) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.173683 sec + 11,879,331,181 cycles # 2.844 GHz + 35,236,712,439 instructions # 2.97 insn per cycle + 4.178908664 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199094356969 -Relative difference = 4.463890496342449e-08 +Avg ME (F77/C++) = 2.0288198597263545 +Relative difference = 6.914050807267083e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.473588e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.982990e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.982990e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.266171e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.744141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.744141e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.018275 sec -INFO: No Floating Point Exceptions have been reported - 5,977,087,994 cycles # 2.951 GHz - 14,582,659,278 instructions # 2.44 insn per cycle - 2.026172081 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2569) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.079083 sec + 5,991,903,430 cycles # 2.877 GHz + 14,602,254,330 instructions # 2.44 insn per cycle + 2.084327795 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193583255634 -Relative difference = 1.7661780742548925e-07 +Avg ME (F77/C++) = 2.0288193158339709 +Relative difference = 1.5567380381214021e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.377553e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.279187e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.279187e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.207154e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.042682e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.042682e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.524553 sec -INFO: No Floating Point Exceptions have been reported - 4,234,763,555 cycles # 2.764 GHz - 8,897,798,804 instructions # 2.10 insn per cycle - 1.532761317 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3552) (512y: 0) (512z: 0) +TOTAL : 1.541810 sec + 4,186,740,965 cycles # 2.708 GHz + 8,926,188,902 instructions # 2.13 insn per cycle + 1.547085242 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3572) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 +Avg ME (F77/C++) = 2.0288181557552889 +Relative difference = 7.677144480713156e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.495273e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.420338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.420338e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.102028e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.913223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.913223e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.502506 sec -INFO: No Floating Point Exceptions have been reported - 4,214,392,060 cycles # 2.792 GHz - 8,461,762,117 instructions # 2.01 insn per cycle - 1.510417354 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3296) (512y: 0) (512z: 0) +TOTAL : 1.563681 sec + 4,235,267,452 cycles # 2.701 GHz + 8,456,560,522 instructions # 2.00 insn per cycle + 1.569074089 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3298) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 +Avg ME (F77/C++) = 2.0288181557552889 +Relative difference = 7.677144480713156e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.487070e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.949626e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.949626e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.304407e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.741587e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.741587e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.014420 sec -INFO: No Floating Point Exceptions have been reported - 3,856,759,695 cycles # 1.908 GHz - 7,749,847,516 instructions # 2.01 insn per cycle - 2.022398856 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3289) (512y: 0) (512z: 2110) +TOTAL : 2.064360 sec + 3,788,747,014 cycles # 1.832 GHz + 7,722,840,376 instructions # 2.04 insn per cycle + 2.069669389 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3288) (512y: 0) (512z: 2115) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288183204829693
-Relative difference = 1.5796536184903122e-07
+Avg ME (F77/C++) = 2.0288182756630704
+Relative difference = 1.3587373071042248e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling
new file mode 100644
index 0000000000..54ccd09765
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:41:00 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.555626e+06 1 256 +2.986119e+06 2 256 +6.036846e+06 4 256 +1.188714e+07 8 256 +2.177797e+07 16 256 +4.206332e+07 32 256 +5.661642e+07 64 256 +6.199098e+07 128 256 +6.763415e+07 256 256 +7.331358e+07 512 256 +7.450922e+07 1024 256 +### GPU: scaling test 32 +1.688262e+05 1 32 +3.674276e+05 2 32 +6.877986e+05 4 32 +1.577034e+06 8 32 +2.900718e+06 16 32 +6.084626e+06 32 32 +1.103805e+07 64 32 +2.304347e+07 128 32 +4.366714e+07 256 32 +5.801104e+07 512 32 +6.280270e+07 1024 32 +6.781899e+07 2048 32 +7.247457e+07 4096 32 +7.443838e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.683557e+05 1 256 +1.766666e+05 2 256 +1.772916e+05 4 256 +### CPU: scaling test 32 +1.624761e+05 1 32 +1.667961e+05 2 32 +1.691810e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.045208e+05 1 256 +3.168070e+05 2 256 +3.217376e+05 4 256 +### CPU: scaling test 32 +2.400438e+05 1 32 +2.988113e+05 2 32 +3.019623e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.679979e+05 1 256 +5.383388e+05 2 256 +5.290511e+05 4 256 +### CPU: scaling test 32 +4.501210e+05 1 32 +5.408786e+05 2 32 +5.212787e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.337937e+05 1 256 +5.659660e+05 2 256 +5.616905e+05 4 256 +### CPU: scaling test 32 +5.554591e+05 1 32 +5.687726e+05 2 32 +5.722998e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +3.669688e+05 1 256 +3.628236e+05 2 256 +3.574239e+05 4 256 +### CPU: scaling test 32 +3.591712e+05 1 32 +3.436223e+05 2 32 +3.302689e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 46bc87b45e..544d45db6c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:58:41 +DATE: 2025-10-11_15:18:10 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.456560e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.379988e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.000705e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.769964e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.181272e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.572183e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.534501 sec -INFO: No Floating Point Exceptions have been reported - 2,219,584,721 cycles # 2.878 GHz - 3,138,987,562 instructions # 1.41 insn per cycle - 0.829330920 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.539441 sec + 2,308,666,493 cycles # 2.818 GHz + 3,226,425,933 instructions # 1.40 insn per cycle + 0.876647709 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/GPU) = 2.0288063984103686 +Relative difference = 2.9652383466921405e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.813220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.859845e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.859845e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.759806e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.804204e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804204e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.928691 sec -INFO: No Floating Point Exceptions have been reported - 17,514,594,449 cycles # 2.949 GHz - 46,201,641,620 instructions # 2.64 insn per cycle - 5.940965337 seconds time elapsed +TOTAL : 6.067261 sec + 17,454,635,732 cycles # 2.875 GHz + 46,423,626,762 instructions # 2.66 insn per cycle + 6.073054725 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.229159e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.395479e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.395479e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.147663e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.305031e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.305031e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.394540 sec -INFO: No Floating Point Exceptions have been reported - 10,052,901,757 cycles # 2.953 GHz - 27,702,324,481 instructions # 2.76 insn 
per cycle - 3.406321535 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2581) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.441893 sec + 9,972,963,833 cycles # 2.894 GHz + 27,538,315,448 instructions # 2.76 insn per cycle + 3.447650533 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.062332e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.465524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.465524e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.024399e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.421447e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.421447e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.217135 sec -INFO: No Floating Point Exceptions have been reported - 6,171,509,914 cycles # 2.770 GHz - 12,603,170,569 instructions # 2.04 insn per cycle - 2.229995554 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2773) (512y: 0) (512z: 0) +TOTAL : 2.195598 sec + 6,002,435,023 cycles # 2.728 GHz + 12,431,827,184 instructions # 2.07 insn per cycle + 2.201348309 seconds time 
elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.580384e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.068896e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.068896e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.239682e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.660399e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.660399e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.025421 sec -INFO: No Floating Point Exceptions have been reported - 5,651,741,681 cycles # 2.776 GHz - 12,038,443,177 instructions # 2.13 insn per cycle - 2.038138408 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2518) (512y: 146) (512z: 0) +TOTAL : 2.110434 sec + 5,712,484,983 cycles # 2.700 GHz + 11,998,977,462 instructions # 2.10 insn per cycle + 2.116158863 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.630973e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.831034e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.831034e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.500878e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.684605e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.684605e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.034632 sec -INFO: No Floating Point Exceptions have been reported - 5,740,712,408 cycles # 1.885 GHz - 8,225,599,297 instructions # 1.43 insn per cycle - 3.047056631 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1671) (512y: 126) (512z: 1862) +TOTAL : 3.104242 sec + 5,600,150,554 cycles # 1.801 GHz + 7,978,262,251 instructions # 1.42 insn per cycle + 3.109987032 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW 
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..108784d281 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:55:32 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +3.842927e+05 1 256 +7.220512e+05 2 256 +1.491222e+06 4 256 +2.667848e+06 8 256 +4.492588e+06 16 256 +7.139826e+06 32 256 +9.157999e+06 64 256 +1.073484e+07 128 256 +1.179428e+07 256 256 +1.249669e+07 512 256 +1.288538e+07 1024 256 +### GPU: scaling test 32 +4.771078e+04 1 32 +9.904224e+04 2 32 +1.834573e+05 4 32 +3.665684e+05 8 32 +7.223823e+05 16 32 +1.469468e+06 32 32 +2.777699e+06 64 32 +4.610551e+06 128 32 +7.035262e+06 256 32 +9.216118e+06 512 32 +1.072571e+07 1024 32 +1.171381e+07 2048 32 +1.244431e+07 4096 32 +1.273882e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.731213e+05 1 256 +1.728516e+05 2 256 +1.721045e+05 4 256 +### CPU: scaling test 32 +1.615729e+05 1 32 +1.697199e+05 2 32 +1.614079e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.020824e+05 1 256 +3.069129e+05 2 256 +3.229135e+05 4 256 +### CPU: scaling test 32 +3.068132e+05 1 32 +3.048781e+05 2 32 +3.056454e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.343999e+05 1 256 +5.367208e+05 2 256 +5.297172e+05 4 256 +### CPU: scaling test 32 +5.308120e+05 1 32 +5.388158e+05 2 32 +5.419802e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.825073e+05 1 256 +5.664394e+05 2 256 +5.715909e+05 4 256 +### CPU: scaling test 32 +5.596656e+05 1 32 +5.686160e+05 2 32 +5.559851e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +3.589260e+05 1 256 +3.525435e+05 2 256 +3.573650e+05 4 256 +### CPU: scaling test 32 +3.610027e+05 1 32 +3.443008e+05 2 32 +3.569646e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..7312e696ce --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:51:10 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.104417e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.285432e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.297689e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.279377 sec + 4,758,540,406 cycles # 2.854 GHz + 6,643,646,071 instructions # 1.40 insn per cycle + 1.727175074 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288064033535846 +Relative difference = 2.940873209649997e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.760176e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.804148e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804148e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.064955 sec + 17,456,010,031 cycles # 2.876 GHz + 46,423,917,890 instructions # 2.66 insn per cycle + 6.070556221 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.112364e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.267713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.267713e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.477891 sec + 9,968,942,008 cycles # 2.863 GHz + 27,538,128,939 instructions # 2.76 insn per cycle + 3.483544020 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.028981e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.424760e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.424760e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.192400 sec + 5,973,164,521 cycles # 2.719 GHz + 12,431,134,039 instructions # 2.08 insn per cycle + 2.197968192 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.257840e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.686842e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.686842e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.101990 sec + 5,696,565,349 cycles # 2.704 GHz + 11,998,610,945 instructions # 2.11 insn per cycle + 2.107441314 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.469903e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.652910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652910e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.130516 sec + 5,582,204,405 cycles # 1.781 GHz + 7,977,597,583 instructions # 1.43 insn per cycle + 3.135909354 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..a27304f7a2 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. 
+ +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:49:40 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.756606e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.155088e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.561577e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.537651 sec + 2,186,941,067 cycles # 2.809 GHz + 3,125,534,216 instructions # 1.43 insn per cycle + 0.834390897 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288063984103686 +Relative difference = 2.9652383466921405e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.767944e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.812249e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.812249e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.039437 sec + 17,472,986,286 cycles # 2.891 GHz + 46,424,951,460 instructions # 2.66 insn per cycle + 6.045113130 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.115406e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.269058e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.269058e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.475319 sec + 9,963,493,199 cycles # 2.863 GHz + 27,538,476,105 instructions # 2.76 insn per cycle + 3.481071152 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.946610e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.336487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.336487e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.229478 sec + 5,990,602,521 cycles # 2.681 GHz + 12,432,421,413 instructions # 2.08 insn per cycle + 2.235415428 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.285571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.719782e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.719782e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.092266 sec + 5,708,527,225 cycles # 2.722 GHz + 11,999,256,931 instructions # 2.10 insn per cycle + 2.098089382 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.527493e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.713588e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.713588e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.081621 sec + 5,593,729,597 cycles # 1.813 GHz + 7,978,349,260 instructions # 1.43 insn per cycle + 3.087480023 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index ffa5410982..1465355626 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:59:06 +DATE: 2025-10-11_15:18:40 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.422071e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.351796e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.985674e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.777084e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.077254e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.446466e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.535641 sec -INFO: No Floating Point Exceptions have been reported - 2,214,747,611 cycles # 2.879 GHz - 3,172,033,471 instructions # 1.43 insn per cycle - 0.829540839 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.540754 sec + 2,303,579,994 cycles # 2.845 GHz + 3,194,596,199 instructions # 1.39 insn per cycle + 0.867263238 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/GPU) = 2.0288063984103686 +Relative difference = 2.9652383466921405e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.862163e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.911340e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.911340e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.824688e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.871754e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.871754e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.777703 sec -INFO: No Floating Point Exceptions have been reported - 17,097,861,095 cycles # 2.954 GHz - 45,230,787,591 instructions # 2.65 insn per cycle - 5.789414615 seconds time elapsed +TOTAL : 5.855357 sec + 17,037,217,478 cycles # 2.907 GHz + 45,397,533,623 instructions # 2.66 insn per cycle + 5.861206077 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal 
loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.356972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.536408e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.536408e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.237044e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.404010e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.404010e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.270231 sec -INFO: No Floating Point Exceptions have been reported - 9,665,855,757 cycles # 2.946 GHz - 26,370,377,514 instructions # 2.73 insn per cycle - 3.281726897 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2386) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.349468 sec + 9,646,439,674 cycles # 2.877 GHz + 26,137,505,372 instructions # 2.71 insn per cycle + 3.359990731 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2348) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.515319e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.832036e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.832036e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) 
= ( 4.466137e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.774981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.774981e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.470441 sec -INFO: No Floating Point Exceptions have been reported - 6,884,599,220 cycles # 2.774 GHz - 14,150,233,239 instructions # 2.06 insn per cycle - 2.482504065 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2896) (512y: 0) (512z: 0) +TOTAL : 2.456437 sec + 6,697,050,662 cycles # 2.721 GHz + 13,944,204,689 instructions # 2.08 insn per cycle + 2.462051029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2872) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.744762e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.096792e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.096792e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.691262e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.027361e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.027361e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.356796 sec -INFO: No Floating Point 
Exceptions have been reported - 6,551,408,744 cycles # 2.767 GHz - 13,642,717,150 instructions # 2.08 insn per cycle - 2.368190066 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2535) (512y: 302) (512z: 0) +TOTAL : 2.343988 sec + 6,390,605,834 cycles # 2.721 GHz + 13,479,985,492 instructions # 2.11 insn per cycle + 2.349738024 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2521) (512y: 302) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.568399e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.763148e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.763148e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.551855e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.739422e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.739422e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.086761 sec -INFO: No Floating Point Exceptions have been reported - 5,741,113,391 cycles # 1.854 GHz - 9,326,512,235 instructions # 1.62 insn per cycle - 3.098253222 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1456) (512y: 212) (512z: 2060) +TOTAL : 3.060308 
sec + 5,571,902,780 cycles # 1.818 GHz + 9,121,747,396 instructions # 1.64 insn per cycle + 3.066113600 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1425) (512y: 212) (512z: 2028) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..13f478253e --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:41:41 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +9.342009e+05 1 256 +1.901727e+06 2 256 +3.513575e+06 4 256 +6.551587e+06 8 256 +9.027157e+06 16 256 +1.070472e+07 32 256 +1.211534e+07 64 256 +1.306873e+07 128 256 +1.345611e+07 256 256 +1.354148e+07 512 256 +1.365009e+07 1024 256 +### GPU: scaling test 32 +1.205755e+05 1 32 +2.514606e+05 2 32 +5.001172e+05 4 32 +9.511001e+05 8 32 +1.851142e+06 16 32 +3.545547e+06 32 32 +6.694933e+06 64 32 +9.515800e+06 128 32 +1.033055e+07 256 32 +1.109138e+07 512 32 +1.156765e+07 1024 32 +1.192504e+07 2048 32 +1.207986e+07 4096 32 +1.213861e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.335000e+04 1 256 +2.360867e+04 2 256 +2.368335e+04 4 256 +### CPU: scaling test 32 +2.236539e+04 1 32 +2.311725e+04 2 32 +2.306838e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.370978e+04 1 256 +4.405634e+04 2 256 +4.456211e+04 4 256 +### CPU: scaling test 32 +3.836659e+04 1 32 +4.179709e+04 2 32 +4.369754e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.926025e+04 1 256 +8.558488e+04 2 256 +8.539748e+04 4 256 +### CPU: scaling test 32 +8.398708e+04 1 32 
+8.906950e+04 2 32 +8.745810e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.556008e+04 1 256 +9.646045e+04 2 256 +9.528700e+04 4 256 +### CPU: scaling test 32 +8.322886e+04 1 32 +8.916295e+04 2 32 +9.000274e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.425669e+04 1 256 +6.732158e+04 2 256 +6.696446e+04 4 256 +### CPU: scaling test 32 +6.780265e+04 1 32 +6.786649e+04 2 32 +6.753983e+04 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 028292e268..53423221d6 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:00:14 +DATE: 2025-10-11_15:20:08 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.612194e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.849217e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.964394e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.590985e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.195514e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.215933e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.477846 sec -INFO: No Floating Point Exceptions have been reported - 1,998,983,760 cycles # 2.871 GHz - 2,812,176,587 instructions # 1.41 insn per cycle - 0.759674168 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.475543 sec + 2,072,965,387 cycles # 2.836 GHz + 2,812,513,904 instructions # 1.36 insn per cycle + 0.789686961 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.042987e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.232338e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.242858e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.134307e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.362144e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.374708e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.621402 sec -INFO: No Floating Point Exceptions have been reported - 2,510,286,495 cycles # 2.883 GHz - 3,752,986,245 instructions # 1.50 insn per cycle - 0.931747637 seconds time elapsed +TOTAL : 0.566501 sec + 2,402,738,046 cycles # 2.849 GHz + 3,415,144,104 instructions # 1.42 insn per cycle + 0.902303425 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/GPU) = 1.4131213684418646 +Relative difference = 4.4692399902091566e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.434605e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.446812e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.446812e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.360536e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.372172e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.372172e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.752117 sec -INFO: No Floating Point Exceptions have been reported - 19,916,103,310 cycles # 2.949 GHz - 59,916,518,373 instructions # 3.01 insn per cycle - 6.756066066 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1199) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.962552 sec + 20,052,897,229 cycles # 2.879 GHz + 60,517,484,268 instructions # 3.02 insn per cycle + 6.966626285 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.568526e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.611480e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.611480e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.457200e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.498681e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.498681e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.606956 sec -INFO: No Floating Point Exceptions have been reported - 10,571,212,167 cycles # 2.928 GHz - 31,086,653,440 instructions # 2.94 insn per cycle - 3.611892241 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5221) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.696167 sec + 10,707,329,548 cycles # 2.895 GHz + 31,170,881,652 instructions # 2.91 insn per cycle + 3.700212507 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5107) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.091675e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.256165e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.256165e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.870920e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.029877e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.029877e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.823998 sec -INFO: No Floating Point Exceptions have been reported - 4,999,238,647 cycles # 2.738 GHz - 11,406,827,724 instructions # 2.28 insn per cycle - 1.827985092 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4642) (512y: 0) (512z: 0) +TOTAL : 1.867542 sec + 5,077,134,246 cycles # 2.714 GHz + 11,510,163,524 instructions # 2.27 insn per cycle + 1.871736808 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4658) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.026950e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.047965e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.047965e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.650179e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.846221e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.846221e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.617207 sec -INFO: No Floating Point Exceptions have been reported - 4,447,500,259 cycles # 2.747 GHz - 10,665,398,274 instructions # 2.40 insn per cycle - 1.621167175 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4378) (512y: 92) (512z: 0) +TOTAL : 1.718355 sec + 4,666,627,650 cycles # 2.711 GHz + 10,813,430,115 instructions # 2.32 insn per cycle + 1.722417533 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4482) (512y: 57) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ 
PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.168386e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.273905e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.273905e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.895380e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.991775e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.991775e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.309115 sec -INFO: No Floating Point Exceptions have been reported - 4,128,751,307 cycles # 1.785 GHz - 5,972,449,468 instructions # 1.45 insn per cycle - 2.314144205 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1620) (512y: 94) (512z: 3577) +TOTAL : 2.398459 sec + 4,202,110,606 cycles # 1.750 GHz + 6,028,015,369 instructions # 1.43 insn per cycle + 2.402798408 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1720) (512y: 63) (512z: 3552) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..88f80f3081 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:56:53 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +3.480668e+05 1 256 +6.757720e+05 2 256 +1.342710e+06 4 256 +1.961408e+06 8 256 +2.863939e+06 16 256 +3.692840e+06 32 256 +4.108363e+06 64 256 +4.389055e+06 128 256 +4.590159e+06 256 256 +4.677980e+06 512 256 +4.719776e+06 1024 256 +### GPU: scaling test 32 +5.093214e+04 1 32 +9.453332e+04 2 32 +1.923664e+05 4 32 +3.828673e+05 8 32 +7.100352e+05 16 32 +1.286052e+06 32 32 +2.074968e+06 64 32 +2.993421e+06 128 32 +3.590529e+06 256 32 +4.025040e+06 512 32 +4.233186e+06 1024 32 +4.428606e+06 2048 32 +4.494795e+06 4096 32 +4.506986e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.283518e+04 1 256 +2.360000e+04 2 256 +2.368362e+04 4 256 +### CPU: scaling test 32 +2.195483e+04 1 32 +2.267087e+04 2 32 +2.328199e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.369761e+04 1 256 +4.426783e+04 2 256 +4.443961e+04 4 256 +### CPU: scaling test 32 +4.205894e+04 1 32 +4.154644e+04 2 32 +4.180789e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.635620e+04 1 256 +8.373531e+04 2 256 +8.654539e+04 4 256 +### CPU: scaling test 32 +8.995865e+04 1 32 +8.789712e+04 2 32 +8.901054e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.711265e+04 1 256 +9.722643e+04 2 256 +9.347803e+04 4 256 +### CPU: scaling test 32 +9.518909e+04 1 32 +9.721140e+04 2 32 +9.724959e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe 
+### CPU: scaling test 256 +6.678497e+04 1 256 +6.627189e+04 2 256 +6.803332e+04 4 256 +### CPU: scaling test 32 +6.749432e+04 1 32 +6.701283e+04 2 32 +6.598727e+04 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 76636470b0..5ea3c579b2 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,231 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:38:23 +DATE: 2025-10-11_16:29:39 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.472313e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.180220e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.180220e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.808698e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.065448e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.065448e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.504857 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,064,539,289 cycles # 2.862 GHz - 3,123,566,672 instructions # 1.51 insn per cycle - 0.778239097 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.500490 sec + 2,152,747,639 cycles # 2.835 GHz + 3,089,120,012 instructions # 1.43 insn per cycle + 0.817131761 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.683325e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.341961e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.341961e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.720979e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.001076e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.001076e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.833212 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,141,452,514 cycles # 2.889 GHz - 4,965,295,428 instructions # 1.58 insn per cycle - 1.145190233 seconds time elapsed +TOTAL : 0.786088 sec + 3,079,796,138 cycles # 2.856 GHz + 4,693,820,986 instructions # 1.52 insn per cycle + 1.137301736 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/GPU) = 1.4131213684418646 +Relative difference = 4.4692399902091566e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.439308e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.451643e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.451643e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.340726e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.352294e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.352294e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.745227 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 19,922,326,116 cycles # 2.952 GHz - 59,921,657,661 instructions # 3.01 insn per cycle - 6.749767217 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1199) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.027688 sec + 20,121,022,602 cycles # 2.862 GHz + 60,520,827,051 instructions # 3.01 insn per cycle + 7.031786887 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.590762e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.634359e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.634359e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.433303e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.475603e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.475603e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.596308 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,606,556,243 cycles # 2.946 GHz - 31,132,640,347 instructions # 2.94 insn per cycle - 3.600784290 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5221) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.724019 sec + 10,754,955,259 cycles # 2.886 GHz + 31,220,075,253 instructions # 2.90 insn per cycle + 3.728441609 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5107) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.045361e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.212711e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.212711e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.799230e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.961399e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.961399e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.840181 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,054,435,549 cycles # 2.741 GHz - 11,457,891,523 instructions # 2.27 insn per cycle - 1.844724432 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4642) (512y: 0) (512z: 0) +TOTAL : 1.890149 sec + 5,120,442,526 cycles # 2.704 GHz + 11,558,215,171 instructions # 2.26 insn per cycle + 1.894456584 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4658) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.028589e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.049854e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.049854e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.595269e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.785975e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.785975e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.621206 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,484,828,096 cycles # 2.760 GHz - 10,715,944,638 instructions # 2.39 insn per cycle - 1.625802151 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4378) (512y: 92) (512z: 0) +TOTAL : 1.735302 sec + 4,701,578,061 cycles # 2.704 GHz + 10,861,447,059 instructions # 2.31 insn per cycle + 1.739681098 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4482) (512y: 57) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.165257e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.268564e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.268564e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.737162e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.834485e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.834485e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.316443 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,162,925,866 cycles # 1.795 GHz - 6,008,954,577 instructions # 1.44 insn per cycle - 2.321140123 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1620) (512y: 94) (512z: 3577) +TOTAL : 2.462185 sec + 4,238,690,147 cycles # 1.719 GHz + 6,064,850,138 instructions # 1.43 insn per cycle + 2.466509903 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1720) (512y: 63) (512z: 3552) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 49402063e2..2fc1d7dc04 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:00:40 +DATE: 2025-10-11_15:20:41 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.575064e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.921304e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.028957e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.786288e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.203485e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.221467e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.477703 sec -INFO: No Floating Point Exceptions have been reported - 1,994,590,518 cycles # 2.865 GHz - 2,848,992,929 instructions # 1.43 insn per cycle - 0.754407053 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.470896 sec + 2,028,123,419 cycles # 2.825 GHz + 2,812,031,573 instructions # 1.39 insn per cycle + 0.775558684 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.042325e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.231825e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.242712e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.146437e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.383510e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.397548e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.616679 sec -INFO: No Floating Point Exceptions have been reported - 2,463,746,118 cycles # 2.874 GHz - 3,716,874,386 instructions # 1.51 insn per cycle - 0.917442132 seconds time elapsed +TOTAL : 0.569288 sec + 2,428,652,206 cycles # 2.852 GHz + 3,427,874,591 instructions # 1.41 insn per cycle + 0.912714324 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/GPU) = 1.4131213684418646 +Relative difference = 4.4692399902091566e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.437110e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.449363e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.449363e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.386609e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.398461e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.398461e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.744189 sec -INFO: No Floating Point Exceptions have been reported - 19,899,963,729 cycles # 2.950 GHz - 60,130,622,589 instructions # 3.02 insn per cycle - 6.748077481 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1322) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.886307 sec + 19,965,917,518 cycles # 2.898 GHz + 60,201,240,687 instructions # 3.02 insn per cycle + 6.890252778 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.632122e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.676125e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.676125e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.533737e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.576916e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.576916e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.557509 sec -INFO: No Floating Point Exceptions have been reported - 10,482,296,489 cycles # 2.944 GHz - 30,686,942,862 instructions # 2.93 insn per cycle - 3.561419011 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5047) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.633851 sec + 10,579,683,505 cycles # 2.909 GHz + 30,847,655,837 instructions # 2.92 insn per cycle + 3.638097883 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4930) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.842314e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.999775e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.999775e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.536026e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.682366e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.682366e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.874706 sec -INFO: No Floating Point Exceptions have been reported - 5,138,957,277 cycles # 2.738 GHz - 11,840,408,683 instructions # 2.30 insn per cycle - 1.878700358 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4746) (512y: 0) (512z: 0) +TOTAL : 1.939515 sec + 5,249,266,634 cycles # 2.702 GHz + 11,982,858,846 instructions # 2.28 insn per cycle + 1.943675108 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4772) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.602387e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.789550e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.789550e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.187873e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.358429e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.358429e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.727070 sec -INFO: No Floating Point Exceptions have been reported - 4,726,480,466 cycles # 2.731 GHz - 11,165,052,550 instructions # 2.36 insn per cycle - 1.731070886 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4403) (512y: 246) (512z: 0) +TOTAL : 1.803322 sec + 4,846,320,602 cycles # 2.683 GHz + 11,310,325,393 instructions # 2.33 insn per cycle + 1.807176987 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4455) (512y: 231) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ 
PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.101185e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.203049e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.203049e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.783861e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.878450e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.878450e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.329881 sec -INFO: No Floating Point Exceptions have been reported - 4,155,200,887 cycles # 1.781 GHz - 6,223,800,996 instructions # 1.50 insn per cycle - 2.334090572 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1516) (512y: 139) (512z: 3679) +TOTAL : 2.437468 sec + 4,222,471,079 cycles # 1.730 GHz + 6,310,155,112 instructions # 1.49 insn per cycle + 2.441536708 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1619) (512y: 119) (512z: 3648) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..66fa52db02 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:42:24 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.020563e+06 1 256 +1.907125e+06 2 256 +3.779714e+06 4 256 +7.211953e+06 8 256 +1.376478e+07 16 256 +2.148631e+07 32 256 +2.475235e+07 64 256 +2.658152e+07 128 256 +2.709334e+07 256 256 +2.813503e+07 512 256 +2.865513e+07 1024 256 +### GPU: scaling test 32 +1.249239e+05 1 32 +2.576023e+05 2 32 +5.236416e+05 4 32 +9.816703e+05 8 32 +1.909308e+06 16 32 +3.564529e+06 32 32 +7.104303e+06 64 32 +1.425315e+07 128 32 +2.099087e+07 256 32 +2.446553e+07 512 32 +2.604809e+07 1024 32 +2.693465e+07 2048 32 +2.780197e+07 4096 32 +2.832618e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.475086e+04 1 256 +2.477196e+04 2 256 +2.498053e+04 4 256 +### CPU: scaling test 32 +2.306794e+04 1 32 +2.472476e+04 2 32 +2.481117e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.800127e+04 1 256 +7.895709e+04 2 256 +7.905572e+04 4 256 +### CPU: scaling test 32 +7.190850e+04 1 32 +7.327190e+04 2 32 +7.683355e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.743170e+05 1 256 +1.714585e+05 2 256 +1.739702e+05 4 256 +### CPU: scaling test 32 +1.605789e+05 1 32 +1.673207e+05 2 32 +1.747798e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.847081e+05 1 256 +1.886928e+05 2 256 +1.844591e+05 4 256 +### CPU: scaling test 32 +1.678389e+05 1 32 +1.901615e+05 2 32 +1.805064e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe 
+### CPU: scaling test 256 +1.398580e+05 1 256 +1.377336e+05 2 256 +1.394286e+05 4 256 +### CPU: scaling test 32 +1.350638e+05 1 32 +1.419406e+05 2 32 +1.392215e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index b4d9344f80..359e7877d9 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,226 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:01:57 +DATE: 2025-10-11_15:22:22 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.641235e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.015793e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.057654e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.457409 sec -INFO: No Floating Point Exceptions have been reported - 1,937,244,275 cycles # 2.867 GHz - 2,710,892,637 instructions # 1.40 insn per cycle - 0.733854811 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.012111e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.590020e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.652888e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002446e+01 ) GeV^-2 +TOTAL : 0.461660 sec + 2,024,209,134 cycles # 2.804 GHz + 2,785,160,230 instructions # 1.38 insn per cycle + 0.779091198 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 211 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.672412e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.384843e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.427387e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.304364e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.823335e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.855285e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 -TOTAL : 0.509900 sec -INFO: No Floating Point Exceptions have been reported - 2,162,696,786 cycles # 2.871 GHz - 3,100,226,347 instructions # 1.43 insn per cycle - 0.811215095 seconds time elapsed +TOTAL : 0.506727 sec + 2,201,759,148 cycles # 2.852 GHz + 3,068,173,195 instructions # 1.39 insn per cycle + 0.828420263 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.412607e+00 -Avg ME (F77/GPU) = 1.4132214305330990 -Relative difference = 0.0004349621183379836 +Avg ME (F77/GPU) = 1.4132214458495582 +Relative difference = 0.0004349729610275725 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.513642e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.526564e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.526564e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.501069e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.514090e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.514090e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.537186 sec -INFO: No Floating Point Exceptions have been reported - 19,278,711,706 cycles # 2.948 GHz - 59,616,757,005 instructions # 3.09 insn per cycle - 6.541004954 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 959) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.569879 sec + 19,152,579,978 cycles # 2.914 GHz + 59,680,745,465 instructions # 3.12 insn per cycle + 6.573833440 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 926) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 Avg ME (F77/C++) = 1.4129949096991936 Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.120315e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.259615e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.259615e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.920524e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.053952e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.053952e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.035209 sec -INFO: No Floating Point Exceptions have been reported - 6,010,527,138 cycles # 2.949 GHz - 17,061,942,080 instructions # 2.84 insn per cycle - 2.038918474 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5856) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.086277 sec + 6,057,068,110 cycles # 2.899 GHz + 17,105,898,955 instructions # 2.82 insn per cycle + 2.090214636 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129954647353316 -Relative difference = 3.2890090308261873e-07 +Avg ME (F77/C++) = 1.4129954481297773 +Relative difference = 3.171488768794332e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.748972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.811746e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.811746e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.680104e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.737565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.737565e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.954915 sec -INFO: No Floating Point Exceptions have been reported - 2,640,169,352 cycles # 2.756 GHz - 6,187,458,591 instructions # 2.34 insn per cycle - 0.958678404 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5105) (512y: 0) (512z: 0) +TOTAL : 0.993425 sec + 2,677,007,034 cycles # 2.687 GHz + 6,240,512,600 instructions # 2.33 insn per cycle + 0.997226702 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE 
program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.923079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.998771e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.998771e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.843149e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.912179e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912179e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.870044 sec -INFO: No Floating Point Exceptions have been reported - 2,402,321,989 cycles # 2.751 GHz - 5,790,080,813 instructions # 2.41 insn per cycle - 0.873863245 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4906) (512y: 37) (512z: 0) +TOTAL : 0.907079 sec + 2,478,306,991 cycles # 2.723 GHz + 5,867,870,372 instructions # 2.37 insn per cycle + 0.910927509 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5009) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.455132e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.498332e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498332e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.382994e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.423338e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.423338e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.146003 sec -INFO: No Floating Point Exceptions have been reported - 2,072,911,951 cycles # 1.804 GHz - 3,391,607,808 instructions # 1.64 insn per cycle - 1.149850121 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2237) (512y: 37) (512z: 3789) +TOTAL : 1.206279 sec + 2,116,978,988 cycles # 1.750 GHz + 3,424,879,930 instructions # 1.62 insn per cycle + 1.210305817 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2346) (512y: 7) (512z: 3767) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133162104498354 +Relative difference = 1.48905011572879e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..03b7dc0471 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:58:16 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +3.727486e+05 1 256 +7.374228e+05 2 256 +1.359495e+06 4 256 +2.228941e+06 8 256 +3.376485e+06 16 256 +4.469020e+06 32 256 +5.249324e+06 64 256 +5.869764e+06 128 256 +6.094954e+06 256 256 +6.260097e+06 512 256 +6.357949e+06 1024 256 +### GPU: scaling test 32 +5.112115e+04 1 32 +9.374377e+04 2 32 +1.887009e+05 4 32 +3.960359e+05 8 32 +7.300603e+05 16 32 +1.308116e+06 32 32 +1.995847e+06 64 32 +3.417585e+06 128 32 +4.455777e+06 256 32 +5.284200e+06 512 32 +5.826269e+06 1024 32 +6.082445e+06 2048 32 +6.255269e+06 4096 32 +6.329872e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.438060e+04 1 256 +2.470219e+04 2 256 +2.476066e+04 4 256 +### CPU: scaling test 32 +2.461887e+04 1 32 +2.470134e+04 2 32 +2.410740e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.129456e+04 1 256 +7.835869e+04 2 256 +7.787307e+04 4 256 +### CPU: scaling test 32 +6.724611e+04 1 32 +6.848385e+04 2 32 +7.303564e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.606597e+05 1 256 +1.630584e+05 2 256 +1.606208e+05 4 256 +### CPU: scaling test 32 +1.551508e+05 1 32 +1.588322e+05 2 32 +1.636465e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.742285e+05 1 256 +1.758288e+05 2 
256 +1.738872e+05 4 256 +### CPU: scaling test 32 +1.750902e+05 1 32 +1.718448e+05 2 32 +1.870659e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.405438e+05 1 256 +1.389272e+05 2 256 +1.380473e+05 4 256 +### CPU: scaling test 32 +1.416732e+05 1 32 +1.383910e+05 2 32 +1.393492e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 89f1af02c0..b34d8177c5 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,275 +10,234 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:38:49 +DATE: 2025-10-11_16:30:12 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.430077e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.496267e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.496267e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.468595 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,993,529,576 cycles # 2.878 GHz - 2,894,144,626 instructions # 1.45 insn per cycle - 0.749153323 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +EvtsPerSec[Rmb+ME] (23) = ( 4.563182e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.822216e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.822216e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.009070e+02 +- 5.002294e+01 ) GeV^-2 +TOTAL : 0.474333 sec + 2,020,095,914 cycles # 2.815 GHz + 2,863,432,755 instructions # 1.42 insn per cycle + 0.775295436 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 211 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.508973e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.254431e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.254431e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.400607e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.017646e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.017646e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737499e+02 +- 4.776369e+02 ) GeV^-2 -TOTAL : 0.658615 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,580,648,777 cycles # 2.888 GHz - 3,894,936,658 instructions # 1.51 insn per cycle - 0.952346890 seconds time elapsed +TOTAL : 0.650114 sec + 2,601,943,365 cycles # 2.840 GHz + 3,913,396,482 instructions # 1.50 insn per cycle + 0.976170377 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.412607e+00 -Avg ME (F77/GPU) = 1.4132214305330990 -Relative difference = 0.0004349621183379836 +Avg ME (F77/GPU) = 1.4132214458495582 +Relative difference = 0.0004349729610275725 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.506368e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.519408e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.519408e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.486527e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.499486e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.499486e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.559975 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 19,280,512,048 cycles # 2.938 GHz - 59,619,141,119 instructions # 3.09 insn per cycle - 6.564243260 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 959) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.611886 sec + 19,177,870,695 cycles # 2.899 GHz + 59,684,285,229 instructions # 3.11 insn per cycle + 6.615966746 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 926) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 Avg ME (F77/C++) = 1.4129949096991936 Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.092271e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.230160e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.230160e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.840675e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.974875e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.974875e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.047307 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,043,775,465 cycles # 2.947 GHz - 17,111,089,922 instructions # 2.83 insn per cycle - 2.051614364 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5856) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.112189 sec + 6,078,517,802 cycles # 2.874 GHz + 17,153,031,314 instructions # 2.82 insn per cycle + 2.116275288 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129954647353316 -Relative difference = 3.2890090308261873e-07 +Avg ME (F77/C++) = 1.4129954481297773 +Relative difference = 3.171488768794332e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.748354e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.809701e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.809701e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.674765e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.733725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.733725e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.959425 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,659,679,748 cycles # 2.761 GHz - 6,224,393,438 instructions # 2.34 insn per cycle - 0.963869172 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5105) (512y: 0) (512z: 0) +TOTAL : 1.001010 sec + 2,696,240,098 cycles # 2.685 GHz + 6,276,404,164 instructions # 2.33 insn per cycle + 1.005076444 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.927524e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.002486e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.002486e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.832147e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.902384e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.902384e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.872058 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,421,094,163 cycles # 2.765 GHz - 5,826,830,021 instructions # 2.41 insn per cycle - 0.876372578 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4906) (512y: 37) (512z: 0) +TOTAL : 0.916582 sec + 2,498,079,452 cycles # 2.717 GHz + 5,903,755,317 instructions # 2.36 insn per cycle + 0.920755361 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5009) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.443486e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.486864e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.486864e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.388850e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.429977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.429977e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.160150 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,098,432,349 cycles # 1.804 GHz - 3,433,067,927 instructions # 1.64 insn per cycle - 1.164579445 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2237) (512y: 37) (512z: 3789) +TOTAL : 1.204887 sec + 2,137,027,835 cycles # 1.769 GHz + 3,465,402,298 instructions # 1.62 insn per cycle + 1.209022745 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2346) (512y: 7) (512z: 3767) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133162104498354 +Relative difference = 1.48905011572879e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 7537d3c84d..1d664001ba 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,226 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:02:18 +DATE: 2025-10-11_15:22:52 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.658659e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.027503e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.066373e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.462988 sec -INFO: No Floating Point Exceptions have been reported - 1,956,715,427 cycles # 2.872 GHz - 2,757,694,861 instructions # 1.41 insn per cycle - 0.742544959 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.986981e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.577936e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.642909e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002446e+01 ) GeV^-2 +TOTAL : 0.465752 sec + 2,027,464,804 cycles # 2.839 GHz + 2,776,602,524 instructions # 1.37 insn per cycle + 0.772091406 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 203 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.669827e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.371215e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.415741e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.311817e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.830173e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.862677e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 -TOTAL : 0.505811 sec -INFO: No Floating Point Exceptions have been reported - 2,123,611,289 cycles # 2.883 GHz - 3,083,974,467 instructions # 1.45 insn per cycle - 0.793454464 seconds time elapsed +TOTAL : 0.507862 sec + 2,193,078,964 cycles # 2.843 GHz + 3,061,556,319 instructions # 1.40 insn per cycle + 0.829701653 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.412607e+00 -Avg ME (F77/GPU) = 1.4132214305330990 -Relative difference = 0.0004349621183379836 +Avg ME (F77/GPU) = 1.4132214458495582 +Relative difference = 0.0004349729610275725 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.488365e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.501255e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.501255e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.494083e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.506993e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.506993e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.602997 sec -INFO: No Floating Point Exceptions have been reported - 19,409,400,884 cycles # 2.938 GHz - 59,351,848,666 instructions # 3.06 insn per cycle - 6.606759387 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1027) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.588418 sec + 19,053,983,564 cycles # 2.891 GHz + 59,396,932,644 instructions # 3.12 insn per cycle + 6.592397812 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 868) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 Avg ME (F77/C++) = 1.4129949096991936 Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.484090e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.633368e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.633368e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.236693e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.382500e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.382500e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.948837 sec -INFO: No Floating Point Exceptions have been reported - 5,764,162,956 cycles # 2.953 GHz - 16,849,716,772 instructions # 2.92 insn per cycle - 1.952678468 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5610) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.007204 sec + 5,773,782,949 cycles # 2.872 GHz + 16,883,450,737 instructions # 2.92 insn per cycle + 2.011190459 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5486) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129954647353316 -Relative difference = 3.2890090308261873e-07 +Avg ME (F77/C++) = 1.4129954481297773 +Relative difference = 3.171488768794332e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.522405e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.569181e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.569181e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.456033e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.499646e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.499646e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.094041 sec -INFO: No Floating Point Exceptions have been reported - 3,018,102,108 cycles # 2.750 GHz - 6,848,568,360 instructions # 2.27 insn per cycle - 1.098202042 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5735) (512y: 0) (512z: 0) +TOTAL : 1.143466 sec + 3,080,089,782 cycles # 2.686 GHz + 6,901,917,276 instructions # 2.24 insn per cycle + 1.147397013 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5760) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.654265e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.710055e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.710055e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.551832e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.601891e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.601891e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.008735 sec -INFO: No Floating Point Exceptions have been reported - 2,794,533,058 cycles # 2.762 GHz - 6,437,695,564 instructions # 2.30 insn per cycle - 1.012558685 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5509) (512y: 23) (512z: 0) +TOTAL : 1.074026 sec + 2,869,050,546 cycles # 2.664 GHz + 6,490,617,462 instructions # 2.26 insn per cycle + 1.077819814 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5562) (512y: 8) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.323435e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.360072e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.360072e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.278723e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.313246e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.313246e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.258302 sec -INFO: No Floating Point Exceptions have been reported - 2,251,923,496 cycles # 1.787 GHz - 3,755,291,572 instructions # 1.67 insn per cycle - 1.262174564 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2467) (512y: 28) (512z: 4084) +TOTAL : 1.301798 sec + 2,284,363,028 cycles # 1.751 GHz + 3,800,071,631 instructions # 1.66 insn per cycle + 1.305803750 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2577) (512y: 9) (512z: 4061) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133162104498354 +Relative difference = 1.48905011572879e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..61f28ab393 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+DATE: 2025-10-11_15:42:03
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+9.413980e+05 1 256
+1.824479e+06 2 256
+3.751768e+06 4 256
+6.821687e+06 8 256
+8.893057e+06 16 256
+1.069198e+07 32 256
+1.203562e+07 64 256
+1.299650e+07 128 256
+1.326879e+07 256 256
+1.353754e+07 512 256
+1.376766e+07 1024 256
+### GPU: scaling test 32
+1.264842e+05 1 32
+2.411881e+05 2 32
+5.002345e+05 4 32
+8.959915e+05 8 32
+1.929825e+06 16 32
+3.400412e+06 32 32
+6.965891e+06 64 32
+9.374242e+06 128 32
+1.031547e+07 256 32
+1.114517e+07 512 32
+1.169216e+07 1024 32
+1.186544e+07 2048 32
+1.211002e+07 4096 32
+1.215036e+07 8192 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.309135e+04 1 256
+2.331383e+04 2 256
+2.334383e+04 4 256
+### CPU: scaling test 32
+2.173266e+04 1 32
+2.264555e+04 2 32
+2.214409e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+4.454087e+04 1 256
+4.509478e+04 2 256
+4.547146e+04 4 256
+### CPU: scaling test 32
+4.000635e+04 1 32
+4.240489e+04 2 32
+4.447787e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+8.989478e+04 1 256
+8.788512e+04 2 256
+9.013990e+04 4 256
+### CPU: scaling test 32
+9.025857e+04 1 32
+9.054908e+04 2 32
+8.932416e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+9.982270e+04 1 256
+9.959330e+04 2 256
+9.964108e+04 4 256
+### CPU: scaling test 32
+9.318362e+04 1 32
+1.002699e+05 2 32
+9.968832e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+6.767141e+04 1 256
+6.818529e+04 2 256
+6.881658e+04 4 256
+### CPU: scaling test 32
+6.813396e+04 1 32
+6.831571e+04 2 32
+6.860475e+04 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
index 6b4617ba56..66176b2229 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
 
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,248 +10,223 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:01:06 +DATE: 2025-10-11_15:21:14 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.531107e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.896113e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.014318e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.723520e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201379e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219641e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.482291 sec -INFO: No Floating Point Exceptions have been reported - 1,996,726,100 cycles # 2.869 GHz - 2,875,927,393 instructions # 1.44 insn per cycle - 0.757518934 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.472516 sec + 2,054,090,006 cycles # 2.841 GHz + 2,817,756,219 instructions # 1.37 insn per cycle + 0.780308929 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 44 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.039985e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.227093e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.238483e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.127139e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.354786e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.367576e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.618226 sec -INFO: No Floating Point Exceptions have been reported - 2,476,524,825 cycles # 2.883 GHz - 3,787,822,568 instructions # 1.53 insn per cycle - 0.918414719 seconds time elapsed +TOTAL : 0.567470 sec + 2,434,469,025 cycles # 2.854 GHz + 3,429,413,924 instructions # 1.41 insn per cycle + 0.911221936 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/GPU) = 1.4131213912822083 +Relative difference = 4.3076096170606456e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.396101e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.408087e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.408087e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.325558e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.336921e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.336921e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.859626 sec -INFO: No Floating Point Exceptions have been reported - 20,206,369,377 cycles # 2.945 GHz - 60,950,595,896 instructions # 3.02 insn per cycle - 6.863727850 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1220) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.066864 sec + 20,436,241,353 cycles # 2.891 GHz + 61,613,414,820 instructions # 3.01 insn per cycle + 7.070927861 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.651759e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.695029e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.695029e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.581252e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.624148e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.624148e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.542669 sec -INFO: No Floating Point Exceptions have been reported - 10,470,195,857 cycles # 2.953 GHz - 30,822,635,750 instructions # 2.94 insn per cycle - 3.546724112 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5351) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.596315 sec + 10,491,200,280 cycles # 2.915 GHz + 30,713,063,869 instructions # 2.93 insn per cycle + 3.600269209 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5149) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213792564823 -Relative difference = 4.392710025734405e-07 +Avg ME (F77/C++) = 1.4131213813302705 +Relative difference = 4.3780348012864624e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.177717e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.345070e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.345070e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.021587e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.189187e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.189187e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.805877 sec -INFO: No Floating Point Exceptions have been reported - 4,960,900,655 cycles # 2.742 GHz - 11,360,293,322 instructions # 2.29 insn per cycle - 1.809915904 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4776) (512y: 0) (512z: 0) +TOTAL : 1.836324 sec + 4,963,572,150 cycles # 2.698 GHz + 11,329,877,800 instructions # 2.28 insn per cycle + 1.840366477 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4650) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause 
SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.047166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.068679e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.068679e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.809724e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.000340e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000340e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.585052 sec -INFO: No Floating Point Exceptions have been reported - 4,379,448,731 cycles # 2.757 GHz - 10,610,063,505 instructions # 2.42 insn per cycle - 1.588995755 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4503) (512y: 84) (512z: 0) +TOTAL : 1.690468 sec + 4,546,028,597 cycles # 2.684 GHz + 10,641,089,172 instructions # 2.34 insn per cycle + 1.694422805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4468) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.890582e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.987179e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.987179e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.931835e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.029866e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.029866e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.401138 sec -INFO: No Floating Point Exceptions have been reported - 4,243,505,288 cycles # 1.765 GHz - 6,171,567,257 instructions # 1.45 insn per cycle - 2.405218093 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2143) (512y: 116) (512z: 3653) +TOTAL : 2.386097 sec + 4,162,019,401 cycles # 1.742 GHz + 5,999,960,287 instructions # 1.44 insn per cycle + 2.390275923 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1724) (512y: 63) (512z: 3594) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..d8428305ae --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:57:35 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.849872e+05 1 256 +5.950036e+05 2 256 +1.135532e+06 4 256 +9.336754e+05 8 256 +2.668945e+06 16 256 +3.526097e+06 32 256 +4.045575e+06 64 256 +4.557983e+06 128 256 +4.782891e+06 256 256 +4.835057e+06 512 256 +4.861240e+06 1024 256 +### GPU: scaling test 32 +3.826136e+04 1 32 +7.325127e+04 2 32 +1.481027e+05 4 32 +3.040622e+05 8 32 +6.040500e+05 16 32 +1.089306e+06 32 32 +1.777835e+06 64 32 +2.826455e+06 128 32 +3.481738e+06 256 32 +3.995216e+06 512 32 +4.416099e+06 1024 32 +4.561881e+06 2048 32 +4.594627e+06 4096 32 +4.620875e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.314037e+04 1 256 +2.324071e+04 2 256 +2.351748e+04 4 256 +### CPU: scaling test 32 +2.156289e+04 1 32 +2.224284e+04 2 32 +2.270647e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.464955e+04 1 256 +4.456312e+04 2 256 +4.557593e+04 4 256 +### CPU: scaling test 32 +3.776841e+04 1 32 +4.243663e+04 2 32 +4.407623e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.329077e+04 1 256 +8.946504e+04 2 256 +8.934937e+04 4 256 +### CPU: scaling test 32 +8.542423e+04 1 32 +9.061011e+04 2 32 +9.100728e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.619475e+04 1 256 +1.000794e+05 2 
256 +9.841918e+04 4 256 +### CPU: scaling test 32 +9.793151e+04 1 32 +9.901818e+04 2 32 +9.971627e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.804216e+04 1 256 +6.812091e+04 2 256 +6.863263e+04 4 256 +### CPU: scaling test 32 +6.817141e+04 1 32 +6.704119e+04 2 32 +6.858619e+04 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 1a268fb0a6..b5540e725a 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:01:31 +DATE: 2025-10-11_15:21:49 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.506525e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.876419e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.986419e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.729045e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.193827e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.214345e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.475723 sec -INFO: No Floating Point Exceptions have been reported - 1,989,777,196 cycles # 2.876 GHz - 2,865,221,599 instructions # 1.44 insn per cycle - 0.750464789 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.476302 sec + 2,069,585,848 cycles # 2.841 GHz + 2,809,792,568 instructions # 1.36 insn per cycle + 0.788016398 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 44 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.040967e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.229706e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.240646e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.148157e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.386565e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.400273e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.612359 sec -INFO: No Floating Point Exceptions have been reported - 2,465,408,367 cycles # 2.885 GHz - 3,759,784,229 instructions # 1.53 insn per cycle - 0.914073870 seconds time elapsed +TOTAL : 0.562536 sec + 2,368,600,308 cycles # 2.829 GHz + 3,390,907,468 instructions # 1.43 insn per cycle + 0.897403591 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/GPU) = 1.4131213912822083 +Relative difference = 4.3076096170606456e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.395973e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.407808e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.407808e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.347035e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.358476e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.358476e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.859771 sec -INFO: No Floating Point Exceptions have been reported - 20,239,178,144 cycles # 2.949 GHz - 61,173,779,461 instructions # 3.02 insn per cycle - 6.863706451 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1272) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.001676 sec + 20,340,735,873 cycles # 2.904 GHz + 61,296,698,560 instructions # 3.01 insn per cycle + 7.005669304 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.702334e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.747762e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.747762e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.588929e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.632804e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.632804e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.505938 sec -INFO: No Floating Point Exceptions have been reported - 10,333,154,234 cycles # 2.946 GHz - 30,534,348,115 instructions # 2.95 insn per cycle - 3.510016853 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5155) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.590204 sec + 10,378,021,696 cycles # 2.888 GHz + 30,395,025,188 instructions # 2.93 insn per cycle + 3.594207111 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4954) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213792564823 -Relative difference = 4.392710025734405e-07 +Avg ME (F77/C++) = 1.4131213813302705 +Relative difference = 4.3780348012864624e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.861323e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.018375e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.018375e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.624880e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.780155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.780155e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.870783 sec -INFO: No Floating Point Exceptions have been reported - 5,160,894,050 cycles # 2.755 GHz - 11,875,310,688 instructions # 2.30 insn per cycle - 1.874839635 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4887) (512y: 0) (512z: 0) +TOTAL : 1.920064 sec + 5,168,529,008 cycles # 2.687 GHz + 11,822,995,259 instructions # 2.29 insn per cycle + 1.924192404 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4749) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause 
SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.768245e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.957717e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.957717e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.374636e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.559382e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.559382e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.697611 sec -INFO: No Floating Point Exceptions have been reported - 4,679,050,155 cycles # 2.751 GHz - 11,168,862,734 instructions # 2.39 insn per cycle - 1.701628470 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4508) (512y: 239) (512z: 0) +TOTAL : 1.767863 sec + 4,740,196,866 cycles # 2.676 GHz + 11,146,224,662 instructions # 2.35 insn per cycle + 1.772001982 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4420) (512y: 221) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.922687e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.020028e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.020028e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.914882e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.012925e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.012925e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.390116 sec -INFO: No Floating Point Exceptions have been reported - 4,256,907,095 cycles # 1.778 GHz - 6,411,350,564 instructions # 1.51 insn per cycle - 2.394737171 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2039) (512y: 162) (512z: 3731) +TOTAL : 2.391894 sec + 4,182,595,672 cycles # 1.747 GHz + 6,238,269,996 instructions # 1.49 insn per cycle + 2.395956127 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1623) (512y: 120) (512z: 3678) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..5a05ffd4cc --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:42:45 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.797622e+05 1 256 +3.709787e+05 2 256 +3.836692e+05 4 256 +4.274394e+05 8 256 +4.457291e+05 16 256 +4.426930e+05 32 256 +4.430121e+05 64 256 +4.414634e+05 128 256 +4.537983e+05 256 256 +4.587406e+05 512 256 +4.539498e+05 1024 256 +### GPU: scaling test 32 +5.646557e+04 1 32 +1.072891e+05 2 32 +1.807325e+05 4 32 +2.717613e+05 8 32 +3.826661e+05 16 32 +3.951829e+05 32 32 +4.316071e+05 64 32 +4.432349e+05 128 32 +4.449540e+05 256 32 +4.447744e+05 512 32 +4.444094e+05 1024 32 +4.520916e+05 2048 32 +4.578060e+05 4096 32 +4.571634e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.852732e+03 1 256 +1.852838e+03 2 256 +1.863778e+03 4 256 +### CPU: scaling test 32 +1.849128e+03 1 32 +1.851000e+03 2 32 +1.853111e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.433326e+03 1 256 +3.428849e+03 2 256 +3.434375e+03 4 256 +### CPU: scaling test 32 +3.324011e+03 1 32 +3.385678e+03 2 32 +3.337661e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.888262e+03 1 256 +7.910674e+03 2 256 +7.940995e+03 4 256 +### CPU: scaling test 32 +7.181194e+03 1 32 +7.616753e+03 2 32 +7.493920e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.845276e+03 1 
256 +8.896166e+03 2 256 +8.958296e+03 4 256 +### CPU: scaling test 32 +8.632795e+03 1 32 +8.574113e+03 2 32 +8.618805e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.742240e+03 1 256 +6.762831e+03 2 256 +6.833848e+03 4 256 +### CPU: scaling test 32 +6.602630e+03 1 32 +6.602109e+03 2 32 +6.640282e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index fe9e9669c6..5da31552e6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:02:40 +DATE: 2025-10-11_15:23:20 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.331120e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.359202e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.361250e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.393219e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.441536e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.444704e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.543918 sec -INFO: No Floating Point Exceptions have been reported - 2,225,694,406 cycles # 2.884 GHz - 3,483,451,829 instructions # 1.57 insn per cycle - 0.837015502 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.499467 sec + 2,136,562,888 cycles # 2.840 GHz + 3,115,290,958 instructions # 1.46 insn per cycle + 0.813463478 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134422e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164730e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165914e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.052190 sec -INFO: No Floating Point Exceptions have been reported - 9,689,726,748 cycles # 2.928 GHz - 22,118,867,491 instructions # 2.28 insn per cycle - 3.368998161 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.884002e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.884932e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.884932e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.853765e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.854661e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.854661e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.713540 sec -INFO: No Floating Point Exceptions have been reported - 25,683,805,881 cycles # 2.947 GHz - 78,963,253,936 instructions # 3.07 insn per cycle - 8.717598721 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.853472 sec + 25,658,433,103 cycles # 2.897 GHz + 78,568,001,018 instructions # 3.06 insn per cycle + 8.857417932 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.540501e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.543820e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.543820e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.376471e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.379465e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.379465e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.639000 sec -INFO: No Floating Point Exceptions have been reported - 13,090,618,968 cycles # 2.820 GHz - 39,561,040,325 instructions # 3.02 insn per cycle - 4.644193645 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.863682 sec + 13,076,523,489 cycles # 2.687 GHz + 39,590,979,607 instructions # 3.03 insn per cycle + 4.867732270 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.087246e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.103223e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.103223e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.895651e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.911901e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.911901e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.035017 sec -INFO: No Floating Point Exceptions have been reported - 5,608,597,608 cycles # 2.752 GHz - 13,825,354,537 instructions # 2.47 insn per cycle - 2.039075619 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.083250 sec + 5,645,439,415 cycles # 2.706 GHz + 13,860,388,601 instructions # 2.46 insn per cycle + 2.087459740 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.190120e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.211201e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.211201e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.894010e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.914275e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.914275e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.791765 sec -INFO: No Floating Point Exceptions have been reported - 4,921,067,926 cycles # 2.743 GHz - 12,507,200,724 instructions # 2.54 insn per cycle - 1.798123347 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 1.850375 sec + 5,008,092,310 cycles # 2.702 GHz + 12,556,513,170 instructions # 2.51 insn per cycle + 1.855114099 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.012553e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.024911e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.024911e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.736940e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.749376e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.749376e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.347251 sec -INFO: No Floating Point Exceptions have been reported - 4,147,263,675 cycles # 1.765 GHz - 6,394,266,077 instructions # 1.54 insn per cycle - 2.352573303 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.440997 sec + 4,200,411,405 cycles # 1.718 GHz + 6,424,496,970 instructions # 1.53 insn per cycle + 2.445446290 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..30ffb7f326 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:58:57 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.872973e+05 1 256 +2.845184e+05 2 256 +3.112851e+05 4 256 +3.602269e+05 8 256 +3.862982e+05 16 256 +3.927910e+05 32 256 +3.975811e+05 64 256 +3.994813e+05 128 256 +3.982764e+05 256 256 +4.044121e+05 512 256 +4.143519e+05 1024 256 +### GPU: scaling test 32 +3.147853e+04 1 32 +5.985873e+04 2 32 +1.086414e+05 4 32 +1.846072e+05 8 32 +2.795140e+05 16 32 +3.171308e+05 32 32 +3.664746e+05 64 32 +3.861934e+05 128 32 +3.935760e+05 256 32 +3.959241e+05 512 32 +3.999573e+05 1024 32 +4.014811e+05 2048 32 +4.043590e+05 4096 32 +4.145995e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.851734e+03 1 256 +1.852841e+03 2 256 +1.858966e+03 4 256 +### CPU: scaling test 32 +1.839862e+03 1 32 +1.843418e+03 2 32 +1.855242e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.376740e+03 1 256 +3.427003e+03 2 256 +3.418754e+03 4 256 +### CPU: scaling test 32 +3.343494e+03 1 32 +3.346688e+03 2 32 +3.350028e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.930406e+03 1 256 +7.927403e+03 2 256 +7.830665e+03 4 256 +### CPU: scaling 
test 32 +7.705971e+03 1 32 +7.749828e+03 2 32 +7.499380e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.438432e+03 1 256 +8.876320e+03 2 256 +8.867251e+03 4 256 +### CPU: scaling test 32 +8.678830e+03 1 32 +8.575889e+03 2 32 +8.706424e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.649041e+03 1 256 +6.668160e+03 2 256 +6.667655e+03 4 256 +### CPU: scaling test 32 +6.543129e+03 1 32 +6.626562e+03 2 32 +6.609869e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..ef3556442f --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:52:22 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.934631e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.970660e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.973586e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.207545 sec + 4,504,483,186 cycles # 2.857 GHz + 6,247,204,557 instructions # 1.39 insn per cycle + 1.634328522 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.840362e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.841255e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.841255e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.917657 sec + 25,674,151,776 cycles # 2.878 GHz + 78,572,254,617 instructions # 3.06 insn per cycle + 8.921718104 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198141133E-004 +Relative difference = 2.8372990776517314e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.319765e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.322676e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.322676e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.946260 sec + 13,085,012,778 cycles # 2.644 GHz + 39,592,390,137 instructions # 3.03 insn per cycle + 4.950371272 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198141122E-004 +Relative difference = 2.837299079287849e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.807824e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.823601e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.823601e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.106755 sec + 5,651,241,480 cycles # 2.678 GHz + 13,863,632,897 instructions # 2.45 insn per cycle + 2.110867653 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.771177e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.791107e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.791107e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.876075 sec + 5,022,531,784 cycles # 2.673 GHz + 12,559,680,227 instructions # 2.50 insn per cycle + 1.880203925 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.686685e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.698350e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.698350e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.459028 sec + 4,208,203,803 cycles # 1.709 GHz + 6,429,086,120 instructions # 1.53 insn per cycle + 2.463275806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index bc0987eea5..afbbcacb7a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:39:37 +DATE: 2025-10-11_16:31:19 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.954093e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.263620e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.263620e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.849435e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.385880e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.385880e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.526732 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,227,837,882 cycles # 2.883 GHz - 3,476,505,124 instructions # 1.56 insn per cycle - 0.832118305 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.489334 sec + 2,114,311,442 cycles # 2.842 GHz + 3,127,238,641 instructions # 1.48 insn per cycle + 0.800689166 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.643761e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.124122e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.124122e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.301805 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,501,615,955 cycles # 2.935 GHz - 23,489,948,913 instructions # 2.24 insn per cycle - 3.634545913 seconds time elapsed +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.879294e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.880182e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.880182e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.851000e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.851887e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.851887e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.737845 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 25,658,913,414 cycles # 2.936 GHz - 78,963,594,343 instructions # 3.08 insn per cycle - 8.742435740 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.871032 sec + 25,693,998,933 cycles # 2.896 GHz + 78,573,360,631 instructions # 3.06 insn per cycle + 8.875307913 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.518464e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.521735e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.521735e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.388018e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.391044e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.391044e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.671849 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 13,102,544,659 cycles # 2.802 GHz - 39,572,381,519 instructions # 3.02 insn per cycle - 4.676455621 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.851540 sec + 13,088,956,582 cycles # 2.696 GHz + 39,603,859,010 instructions # 3.03 insn per cycle + 4.856264549 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe 
[ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.057114e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.073561e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.073561e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.795496e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.810972e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.810972e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.046600 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,627,314,455 cycles # 2.744 GHz - 13,834,298,777 instructions # 2.46 insn per cycle - 2.051219882 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.115018 sec + 5,684,762,872 cycles # 2.683 GHz + 13,871,040,440 instructions # 2.44 insn per cycle + 2.119380961 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.239341e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.261385e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.261385e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.855184e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.876301e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.876301e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.786219 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,931,565,389 cycles # 2.756 GHz - 12,515,991,121 instructions # 2.54 insn per cycle - 1.790909503 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 1.862992 sec + 5,028,827,648 cycles # 2.694 GHz + 12,567,491,832 instructions # 2.50 insn per cycle + 1.867563931 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.038188e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.051446e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.051446e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.712981e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.724915e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.724915e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.341272 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,150,945,217 cycles # 1.770 GHz - 6,403,675,117 instructions # 1.54 insn per cycle - 2.345955468 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.454832 sec + 4,213,905,835 cycles # 1.714 GHz + 6,436,340,551 instructions # 1.53 insn per cycle + 2.459274611 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index be31042fc1..d4d5e2b45e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:50:33 +DATE: 2025-10-11_16:44:57 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.314159e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.339458e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.341417e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.369462e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.419383e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.422637e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.518126 sec -INFO: No Floating Point Exceptions have been reported - 2,164,802,026 cycles # 2.881 GHz - 3,409,915,390 instructions # 1.58 insn per cycle - 0.811338657 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.487281 sec + 2,090,605,611 cycles # 2.842 GHz + 3,063,541,899 instructions # 1.47 insn per cycle + 0.797172689 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134613e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.165487e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.166746e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.140406 sec -INFO: No Floating Point Exceptions have been reported - 9,973,053,404 cycles # 2.934 GHz - 20,986,544,572 instructions # 2.10 insn per cycle - 3.455765313 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.884135e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.885033e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.885033e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.849332e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850241e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850241e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.712529 sec -INFO: No Floating Point Exceptions have been reported - 25,691,717,185 cycles # 2.948 GHz - 78,960,325,856 instructions # 3.07 insn per cycle - 8.716734440 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.876225 sec + 25,662,776,506 cycles # 2.890 GHz + 78,567,147,731 instructions # 3.06 insn per cycle + 8.880187224 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.543458e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.546697e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.546697e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.358067e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.361108e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.361108e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.636367 sec -INFO: No Floating Point Exceptions have been reported - 13,067,183,546 cycles # 2.816 GHz - 39,558,454,763 instructions # 3.03 insn per cycle - 4.640590687 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.892312 sec + 13,068,286,128 cycles # 2.669 GHz + 39,590,526,259 instructions # 3.03 insn per cycle + 4.896571237 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.084806e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.101064e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.101064e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.827564e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.843333e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.843333e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.036679 sec -INFO: No Floating Point Exceptions have been reported - 5,613,470,524 cycles # 2.752 GHz - 13,823,796,455 instructions # 2.46 insn per cycle - 2.040900437 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.103410 sec + 5,668,034,580 cycles # 2.691 GHz + 13,860,472,796 instructions # 2.45 insn per cycle + 2.107462678 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.198723e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.219905e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.219905e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.833416e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.853413e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.853413e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.791160 sec -INFO: No Floating Point Exceptions have been reported - 4,922,288,820 cycles # 2.743 GHz - 12,503,388,745 instructions # 2.54 insn per cycle - 1.795321275 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 1.864637 sec + 5,021,320,374 cycles # 2.689 GHz + 12,554,612,891 instructions # 2.50 insn per cycle + 1.868702414 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.975365e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.987686e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.987686e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.674295e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.686265e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.686265e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.359532 sec -INFO: No Floating Point Exceptions have been reported - 4,155,009,705 cycles # 1.759 GHz - 6,390,945,346 instructions # 1.54 insn per cycle - 2.363732897 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.465332 sec + 4,203,800,820 cycles # 1.703 GHz + 6,422,604,226 instructions # 1.53 insn per cycle + 2.469400350 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 437b6b7cbd..2beaf322b6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:47:41 +DATE: 2025-10-11_16:41:27 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.310053e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.334627e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.336677e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.390277e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.431631e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.434858e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.518612 sec -INFO: No Floating Point Exceptions have been reported - 2,156,837,380 cycles # 2.875 GHz - 3,433,389,555 instructions # 1.59 insn per cycle - 0.811650542 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.485227 sec + 2,088,179,344 cycles # 2.833 GHz + 3,069,782,317 instructions # 1.47 insn per cycle + 0.797220882 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.128944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.159258e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.160487e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.091523 sec -INFO: No Floating Point Exceptions have been reported - 9,825,563,648 cycles # 2.933 GHz - 22,802,776,931 instructions # 2.32 insn per cycle - 3.405923259 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.890035e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.890938e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.890938e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.841686e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.842564e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.842564e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.683864 sec -INFO: No Floating Point Exceptions have been reported - 25,635,022,031 cycles # 2.951 GHz - 78,960,809,140 instructions # 3.08 insn per cycle - 8.688143049 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.911703 sec + 25,672,385,298 cycles # 2.880 GHz + 78,567,422,772 instructions # 3.06 insn per cycle + 8.915910048 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.535619e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.538805e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.538805e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.377610e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.380670e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.380670e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.644682 sec -INFO: No Floating Point Exceptions have been reported - 13,070,212,228 cycles # 2.812 GHz - 39,558,910,913 instructions # 3.03 insn per cycle - 4.648863484 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.861995 sec + 13,083,483,284 cycles # 2.689 GHz + 39,590,790,279 instructions # 3.03 insn per cycle + 4.866021467 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.974136e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.989764e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.989764e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.782247e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.797307e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.797307e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.062978 sec -INFO: No Floating Point Exceptions have been reported - 5,609,565,523 cycles # 2.715 GHz - 13,823,736,601 instructions # 2.46 insn per cycle - 2.067208066 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.113995 sec + 5,648,509,407 cycles # 2.668 GHz + 13,860,950,299 instructions # 2.45 insn per cycle + 2.118130954 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.256862e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.278276e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.278276e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.815640e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.835781e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.835781e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.778135 sec -INFO: No Floating Point Exceptions have been reported - 4,913,104,520 cycles # 2.758 GHz - 12,505,156,898 instructions # 2.55 insn per cycle - 1.782374042 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 1.866689 sec + 5,013,333,127 cycles # 2.681 GHz + 12,556,528,301 instructions # 2.50 insn per cycle + 1.870730508 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.040533e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.053211e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.053211e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.601628e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.612890e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.612890e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.335968 sec -INFO: No Floating Point Exceptions have been reported - 4,137,289,106 cycles # 1.769 GHz - 6,392,511,975 instructions # 1.55 insn per cycle - 2.340416062 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.490563 sec + 4,200,883,402 cycles # 1.685 GHz + 6,425,171,149 instructions # 1.53 insn per cycle + 2.494555434 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..2815ba1af8 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:50:33 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.400466e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.444219e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.447053e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.504359 sec + 2,085,179,396 cycles # 2.830 GHz + 3,096,904,235 instructions # 1.49 insn per cycle + 0.798389923 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.851668e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.852556e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.852556e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.863632 sec + 25,676,607,785 cycles # 2.896 GHz + 78,566,655,326 instructions # 3.06 insn per cycle + 8.867760313 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198141133E-004 +Relative difference = 2.8372990776517314e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.364733e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.367766e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367766e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.880672 sec + 13,087,360,743 cycles # 2.680 GHz + 39,590,709,537 instructions # 3.03 insn per cycle + 4.884841575 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198141122E-004 +Relative difference = 2.837299079287849e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.891642e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.907720e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.907720e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.084604 sec + 5,646,655,758 cycles # 2.704 GHz + 13,860,514,996 instructions # 2.45 insn per cycle + 2.088799789 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.832886e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.853061e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.853061e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.862981 sec + 5,001,186,272 cycles # 2.680 GHz + 12,556,644,714 instructions # 2.51 insn per cycle + 1.867187074 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.594055e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.605629e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.605629e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.493451 sec + 4,195,828,592 cycles # 1.681 GHz + 6,424,665,239 instructions # 1.53 insn per cycle + 2.497646028 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index f2b15e4b6f..0158323c78 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:44:55 +DATE: 2025-10-11_16:38:00 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.041462e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.325366e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.327398e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.928428e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.433382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.436767e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.520118 sec -INFO: No Floating Point Exceptions have been reported - 2,177,158,293 cycles # 2.891 GHz - 3,464,316,990 instructions # 1.59 insn per cycle - 0.812097316 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.486860 sec + 2,086,798,241 cycles # 2.826 GHz + 3,070,254,605 instructions # 1.47 insn per cycle + 0.797700561 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst OMP= +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.734798e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.174453e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.175668e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.213650 sec -INFO: No Floating Point Exceptions have been reported - 10,150,922,529 cycles # 2.918 GHz - 23,231,659,490 instructions # 2.29 insn per cycle - 3.538737264 seconds time elapsed +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.885407e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.886309e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.886309e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.846748e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.847641e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 1.847641e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.705137 sec -INFO: No Floating Point Exceptions have been reported - 25,650,530,800 cycles # 2.946 GHz - 78,960,008,246 instructions # 3.08 insn per cycle - 8.709419634 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.887132 sec + 25,658,141,408 cycles # 2.886 GHz + 78,568,113,694 instructions # 3.06 insn per cycle + 8.891273835 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.551750e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.554937e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.554937e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.370014e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.373021e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.373021e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 
2.368970e+00 ) GeV^-4 -TOTAL : 4.623453 sec -INFO: No Floating Point Exceptions have been reported - 13,056,946,389 cycles # 2.822 GHz - 39,559,090,760 instructions # 3.03 insn per cycle - 4.627712527 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.872933 sec + 13,079,305,653 cycles # 2.683 GHz + 39,591,036,555 instructions # 3.03 insn per cycle + 4.877066552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.090893e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.106933e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.106933e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.876108e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.892295e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.892295e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.033338 sec -INFO: No Floating Point Exceptions have 
been reported - 5,609,780,879 cycles # 2.754 GHz - 13,824,722,765 instructions # 2.46 insn per cycle - 2.037509617 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.088702 sec + 5,640,399,522 cycles # 2.696 GHz + 13,860,298,624 instructions # 2.46 insn per cycle + 2.092763612 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.188897e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.209893e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.209893e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.890465e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.910782e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.910782e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.791081 sec -INFO: No Floating Point Exceptions have been reported - 4,916,057,270 cycles # 2.740 GHz - 12,505,186,935 instructions # 
2.54 insn per cycle - 1.795355106 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 1.851027 sec + 4,999,453,261 cycles # 2.696 GHz + 12,556,321,373 instructions # 2.51 insn per cycle + 1.855011471 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.019116e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.031683e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.031683e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.623877e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.635346e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.635346e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.343107 sec -INFO: No Floating Point Exceptions have been reported - 4,136,898,273 cycles # 1.763 GHz - 6,392,336,539 instructions # 1.55 insn per cycle - 2.347534329 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.482437 sec + 4,198,161,225 cycles # 1.689 GHz + 6,424,537,434 instructions # 1.53 insn per cycle + 2.486588561 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 99e413a8a3..f41a7b9938 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:03:14 +DATE: 2025-10-11_15:24:03 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.332738e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.357821e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.359802e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.429377e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.477740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.480923e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.542209 sec -INFO: No Floating Point Exceptions have been reported - 2,220,139,727 cycles # 2.875 GHz - 3,465,138,857 instructions # 1.56 insn per cycle - 0.835706398 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.500889 sec + 2,161,311,557 cycles # 2.855 GHz + 3,140,076,215 instructions # 1.45 insn per cycle + 0.823418290 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.145716e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.176488e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.177708e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.039240 sec -INFO: No Floating Point Exceptions have been reported - 9,630,090,535 cycles # 2.918 GHz - 21,945,170,652 instructions # 2.28 insn per cycle - 3.356721463 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.881580e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.882499e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.882499e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.849400e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850323e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850323e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.723377 sec -INFO: No Floating Point Exceptions have been reported - 25,611,709,249 cycles # 2.935 GHz - 78,703,444,126 instructions # 3.07 insn per cycle - 8.727502935 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4191) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.874198 sec + 25,611,778,767 cycles # 2.885 GHz + 78,652,591,485 instructions # 3.07 insn per cycle + 8.878147244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.593581e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.596889e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.596889e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.379484e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.382464e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.382464e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.571814 sec -INFO: No Floating Point Exceptions have been reported - 13,039,592,628 cycles # 2.851 GHz - 39,453,086,877 instructions # 3.03 insn per cycle - 4.575893049 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12966) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.859162 sec + 13,089,109,626 cycles # 2.692 GHz + 39,515,404,087 instructions # 3.02 insn per cycle + 4.863216879 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13022) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.986878e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.003760e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.003760e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.837369e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.853285e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.853285e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.061484 sec -INFO: No Floating Point Exceptions have been reported - 5,673,128,561 cycles # 2.749 GHz - 13,911,820,426 instructions # 2.45 insn per cycle - 2.066505881 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11582) (512y: 0) (512z: 0) +TOTAL : 2.098643 sec + 5,677,190,930 cycles # 2.701 GHz + 13,961,575,914 instructions # 2.46 insn per cycle + 2.102810449 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11630) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.098916e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.119150e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.119150e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.705091e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.724821e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.724821e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.809563 sec -INFO: No Floating Point Exceptions have been reported - 4,990,015,585 cycles # 2.753 GHz - 12,604,471,256 instructions # 2.53 insn per cycle - 1.813650628 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10423) (512y: 241) (512z: 0) +TOTAL : 1.889961 sec + 5,055,738,073 cycles # 2.670 GHz + 12,659,664,704 instructions # 2.50 insn per cycle + 1.894052230 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10483) (512y: 226) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.910207e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.922434e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.922434e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.677757e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.689492e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.689492e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.380650 sec -INFO: No Floating Point Exceptions have been reported - 4,192,440,259 cycles # 1.759 GHz - 6,502,191,985 instructions # 1.55 insn per cycle - 2.384674618 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1754) (512y: 193) (512z: 9382) +TOTAL : 2.462163 sec + 4,206,188,103 cycles # 1.706 GHz + 6,542,388,485 instructions # 1.56 insn per cycle + 2.466313710 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1764) (512y: 185) (512z: 9379) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 76362e2777..b05fc67f3a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:30:00 +DATE: 2025-10-11_16:20:09 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.108959e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.129301e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.130870e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.059658e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.097347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.099827e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.545749 sec -INFO: No Floating Point Exceptions have been reported - 2,205,865,001 cycles # 2.840 GHz - 3,412,138,367 instructions # 1.55 insn per cycle - 0.835130533 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.501512 sec + 2,120,097,032 cycles # 2.815 GHz + 3,067,817,522 instructions # 1.45 insn per cycle + 0.823770320 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.747537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.771352e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.772362e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.317305 sec -INFO: No Floating Point Exceptions have been reported - 10,470,225,400 cycles # 2.928 GHz - 22,893,642,046 instructions # 2.19 insn per cycle - 3.632348979 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 Avg ME (F77/GPU) = 6.6266731198158122E-004 Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.279433e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.279917e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.279917e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.202543e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.203008e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.203008e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 38.330200 sec -INFO: No Floating Point Exceptions have been reported - 112,786,835,820 cycles # 2.943 GHz - 144,812,254,859 instructions # 1.28 insn per cycle - 38.334547107 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:21273) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 39.031219 sec + 112,588,276,317 cycles # 2.885 GHz + 142,621,877,493 instructions # 1.27 insn per cycle + 39.035229334 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20355) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating 
Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.132336e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.134792e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.134792e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.909352e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.911559e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.911559e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.242571 sec -INFO: No Floating Point Exceptions have been reported - 14,761,048,074 cycles # 2.814 GHz - 37,609,615,991 instructions # 2.55 insn per cycle - 5.246531710 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68172) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.643908 sec + 15,024,056,162 cycles # 2.661 GHz + 37,385,323,408 instructions # 2.49 insn per cycle + 5.648271623 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:67523) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141209E-004 Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.367426e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.381363e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.381363e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.457222e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.471736e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.471736e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.233268 sec -INFO: No Floating Point Exceptions have been reported - 6,121,196,467 cycles # 2.737 GHz - 13,054,881,187 instructions # 2.13 insn per cycle - 2.237420808 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46946) (512y: 0) (512z: 0) +TOTAL : 2.205981 sec + 5,946,476,110 cycles # 2.692 GHz + 12,809,216,170 instructions # 2.15 insn per cycle + 2.210041352 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45792) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.964974e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.985321e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.985321e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.156302e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.178569e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.178569e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.836637 sec -INFO: No Floating Point Exceptions have been reported - 5,064,709,437 cycles # 2.753 GHz - 11,452,008,336 instructions # 2.26 insn per cycle - 1.840705951 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40486) (512y: 285) (512z: 0) +TOTAL : 1.797567 sec + 4,817,758,417 cycles # 2.675 GHz + 11,422,908,794 instructions # 2.37 insn per cycle + 1.801731550 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40102) (512y: 282) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.358991e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.372760e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.372760e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.936851e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.949204e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.949204e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.235964 sec -INFO: No Floating Point Exceptions have been reported - 3,956,538,826 cycles # 1.767 GHz - 5,928,749,634 instructions # 1.50 insn per cycle - 2.240037452 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2444) (512y: 337) (512z:39338) +TOTAL : 2.370929 sec + 4,028,743,609 cycles # 1.697 GHz + 5,966,081,307 instructions # 1.48 insn per cycle + 2.375198937 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2453) (512y: 337) (512z:39235) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 5040f4b335..10c6792da9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:31:09 +DATE: 2025-10-11_16:21:27 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.107076e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.130192e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.131670e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.079972e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.118608e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.121448e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.539226 sec -INFO: No Floating Point Exceptions have been reported - 2,240,615,938 cycles # 2.902 GHz - 3,467,491,001 instructions # 1.55 insn per cycle - 0.828466018 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.505348 sec + 2,147,536,542 cycles # 2.834 GHz + 3,073,502,942 instructions # 1.43 insn per cycle + 0.816880103 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.751881e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.775679e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.776668e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.303070 sec -INFO: No Floating Point Exceptions have been reported - 10,434,569,638 cycles # 2.930 GHz - 24,118,235,140 instructions # 2.31 insn per cycle - 3.617886016 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 Avg ME (F77/GPU) = 6.6266731198158122E-004 Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.241409e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.241886e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.241886e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.177605e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.178066e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178066e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 38.674103 sec -INFO: No Floating Point Exceptions have been reported - 113,958,477,984 cycles # 2.947 GHz - 144,286,195,418 instructions # 1.27 insn per cycle - 38.678088373 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:21024) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 39.263371 sec + 113,104,353,359 cycles # 2.881 GHz + 142,499,000,297 instructions # 1.26 insn per cycle + 39.267518963 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating 
Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140450E-004 -Relative difference = 2.83729918072716e-07 +Avg ME (F77/C++) = 6.6266731198140461E-004 +Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.007169e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.009483e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.009483e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.978578e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.980900e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.980900e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.460584 sec -INFO: No Floating Point Exceptions have been reported - 15,281,187,875 cycles # 2.797 GHz - 37,839,169,102 instructions # 2.48 insn per cycle - 5.464853538 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68594) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.512347 sec + 14,738,984,303 cycles # 2.672 GHz + 37,383,415,891 instructions # 2.54 insn per cycle + 5.516366576 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:67498) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point 
Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141209E-004 Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.567317e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.582163e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.582163e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.475575e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.489872e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.489872e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.174218 sec -INFO: No Floating Point Exceptions have been reported - 6,020,206,289 cycles # 2.765 GHz - 12,923,983,464 instructions # 2.15 insn per cycle - 2.178219828 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46048) (512y: 0) (512z: 0) +TOTAL : 2.200089 sec + 5,900,324,656 cycles # 2.678 GHz + 12,761,113,056 instructions # 2.16 insn per cycle + 2.204163616 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45170) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.900478e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.920792e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.920792e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.197126e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.219484e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.219484e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.849478 sec -INFO: No Floating Point Exceptions have been reported - 5,102,330,026 cycles # 2.754 GHz - 11,453,366,172 instructions # 2.24 insn per cycle - 1.853513717 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40151) (512y: 219) (512z: 0) +TOTAL : 1.789159 sec + 4,800,966,323 cycles # 2.679 GHz + 11,387,516,470 instructions # 2.37 insn per cycle + 1.793280010 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:39634) (512y: 220) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.368242e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.382314e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.382314e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.918624e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.931258e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.931258e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.232876 sec -INFO: No Floating Point Exceptions have been reported - 3,951,515,189 cycles # 1.767 GHz - 5,896,746,544 instructions # 1.49 insn per cycle - 2.236852257 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1959) (512y: 259) (512z:38977) +TOTAL : 2.376650 sec + 4,022,990,522 cycles # 1.691 GHz + 5,935,742,762 instructions # 1.48 insn per cycle + 2.380804465 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1962) (512y: 259) (512z:38890) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..66df8ea815 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:43:39 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.135255e+05 1 256 +5.793061e+05 2 256 +6.367973e+05 4 256 +7.358963e+05 8 256 +7.953962e+05 16 256 +8.026621e+05 32 256 +8.113874e+05 64 256 +8.126232e+05 128 256 +8.151724e+05 256 256 +8.388200e+05 512 256 +8.795025e+05 1024 256 +### GPU: scaling test 32 +5.987397e+04 1 32 +1.082531e+05 2 32 +2.101123e+05 4 32 +2.737883e+05 8 32 +5.126747e+05 16 32 +6.967787e+05 32 32 +7.376223e+05 64 32 +7.871564e+05 128 32 +8.121480e+05 256 32 +8.130411e+05 512 32 +8.134619e+05 1024 32 +8.204307e+05 2048 32 +8.423180e+05 4096 32 +8.883516e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.920624e+03 1 256 +1.925794e+03 2 256 +1.919663e+03 4 256 +### CPU: scaling test 32 +1.889651e+03 1 32 +1.920077e+03 2 32 +1.912129e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.748798e+03 1 256 +6.810960e+03 2 256 +6.802786e+03 4 256 +### CPU: scaling test 32 +6.554707e+03 1 32 +6.688739e+03 2 32 +6.725225e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.524095e+04 1 256 +1.526644e+04 2 256 +1.569761e+04 4 256 +### CPU: scaling test 32 +1.566123e+04 1 32 +1.560506e+04 2 32 +1.523576e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.747918e+04 1 
256 +1.758742e+04 2 256 +1.773825e+04 4 256 +### CPU: scaling test 32 +1.691546e+04 1 32 +1.701187e+04 2 32 +1.740175e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.350824e+04 1 256 +1.356994e+04 2 256 +1.370361e+04 4 256 +### CPU: scaling test 32 +1.321355e+04 1 32 +1.322154e+04 2 32 +1.321729e+04 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index c4676334b0..edf11bdd4c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:04:57 +DATE: 2025-10-11_15:26:12 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.476973e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.519601e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.523500e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.969754e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.061645e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.069860e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.498075 sec -INFO: No Floating Point Exceptions have been reported - 2,049,620,143 cycles # 2.856 GHz - 3,058,097,989 instructions # 1.49 insn per cycle - 0.977244524 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.480574 sec + 2,060,773,811 cycles # 2.817 GHz + 2,941,122,949 instructions # 1.43 insn per cycle + 0.791153613 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.124860e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.187008e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.189727e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.797790 sec -INFO: No Floating Point Exceptions have been reported - 5,916,497,978 cycles # 2.910 GHz - 12,115,730,956 instructions # 2.05 insn per cycle - 2.090370837 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.932981e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.933931e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.933931e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.903278e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.904203e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904203e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.490769 sec -INFO: No Floating Point Exceptions have been reported - 24,922,868,630 cycles # 2.935 GHz - 79,110,265,707 instructions # 3.17 insn per cycle - 8.496015758 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.622014 sec + 25,008,733,138 cycles # 2.900 GHz + 79,110,262,561 instructions # 3.16 insn per cycle + 8.625952005 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.975543e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.988298e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.988298e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.866781e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.879439e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.879439e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.356100 sec -INFO: No Floating Point Exceptions have been reported - 6,536,263,436 cycles # 2.771 GHz - 20,271,266,485 instructions # 3.10 insn per cycle - 2.362378155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.393369 sec + 6,521,051,461 cycles # 2.721 GHz + 20,285,887,455 instructions # 3.11 insn per cycle + 2.397558323 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861442972011E-004 Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.588631e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.595153e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.595153e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.574802e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.581515e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.581515e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.038490 sec -INFO: No Floating Point Exceptions have been reported - 2,837,721,779 cycles # 2.726 GHz - 7,066,858,765 instructions # 2.49 insn per cycle - 1.044464831 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.046468 sec + 2,851,964,901 cycles # 2.717 GHz + 7,084,391,235 instructions # 2.48 insn per cycle + 1.050530428 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.762421e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.770702e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.770702e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.745784e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.753552e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.753552e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.936394 sec -INFO: No Floating Point Exceptions have been reported - 2,577,125,275 cycles # 2.745 GHz - 6,404,206,024 instructions # 2.49 insn per cycle - 0.941322355 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 0.944326 sec + 2,540,352,407 cycles # 2.681 GHz + 6,429,340,698 instructions # 2.53 insn per cycle + 0.948183906 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.409980e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.415034e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.415034e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.337094e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.341815e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.341815e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.170914 sec -INFO: No Floating Point Exceptions have been reported - 2,069,436,546 cycles # 1.766 GHz - 3,304,699,013 instructions # 1.60 insn per cycle - 1.174781391 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.231615 sec + 2,100,593,891 cycles # 1.701 GHz + 3,321,026,364 instructions # 1.58 insn per cycle + 1.235667181 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779718007E-004 Relative difference = 4.194411063934945e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..ef0c8bca55 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:00:32 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.335389e+05 1 256 +3.586592e+05 2 256 +4.818891e+05 4 256 +5.593817e+05 8 256 +6.056925e+05 16 256 +6.276955e+05 32 256 +6.367619e+05 64 256 +6.473110e+05 128 256 +6.476010e+05 256 256 +6.505009e+05 512 256 +6.687069e+05 1024 256 +### GPU: scaling test 32 +3.216908e+04 1 32 +6.168033e+04 2 32 +1.180476e+05 4 32 +1.918642e+05 8 32 +3.068465e+05 16 32 +4.811781e+05 32 32 +5.662467e+05 64 32 +6.060356e+05 128 32 +6.424836e+05 256 32 +6.336577e+05 512 32 +6.477611e+05 1024 32 +6.516195e+05 2048 32 +6.509793e+05 4096 32 +6.718523e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.906133e+03 1 256 +1.895289e+03 2 256 +1.894897e+03 4 256 +### CPU: scaling test 32 +1.889460e+03 1 32 +1.885630e+03 2 32 +1.887908e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.645424e+03 1 256 +6.741425e+03 2 256 +6.801857e+03 4 256 +### CPU: scaling test 32 +6.523685e+03 1 32 +6.609563e+03 2 32 +6.739293e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.544354e+04 1 256 +1.568938e+04 2 256 +1.565635e+04 4 256 +### CPU: scaling 
test 32 +1.473739e+04 1 32 +1.556619e+04 2 32 +1.562139e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.746432e+04 1 256 +1.767402e+04 2 256 +1.746961e+04 4 256 +### CPU: scaling test 32 +1.748124e+04 1 32 +1.594924e+04 2 32 +1.708084e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.329941e+04 1 256 +1.349011e+04 2 256 +1.344081e+04 4 256 +### CPU: scaling test 32 +1.333268e+04 1 32 +1.314999e+04 2 32 +1.325747e+04 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..701efdbc30 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:54:02 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.311490e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.371404e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.377432e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 1.171779 sec + 4,342,560,419 cycles # 2.834 GHz + 5,966,664,550 instructions # 1.37 insn per cycle + 1.591397840 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262664623572415E-004 +Relative difference = 2.8452263353202596e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.892352e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893287e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893287e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.671691 sec + 25,006,063,904 cycles # 2.883 GHz + 79,110,972,034 instructions # 3.16 insn per cycle + 8.675650420 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.783736e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.796482e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.796482e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.422556 sec + 6,525,728,187 cycles # 2.691 GHz + 20,285,987,046 instructions # 3.11 insn per cycle + 2.426471276 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.560871e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.567340e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.567340e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.055589 sec + 2,850,961,292 cycles # 2.692 GHz + 7,084,449,005 instructions # 2.48 insn per cycle + 1.059632714 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.733304e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.741477e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.741477e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.951122 sec + 2,540,771,004 cycles # 2.663 GHz + 6,429,427,589 instructions # 2.53 insn per cycle + 0.954962814 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.328792e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.333460e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.333460e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.239447 sec + 2,103,191,835 cycles # 1.693 GHz + 3,321,146,945 instructions # 1.58 insn per cycle + 1.243442238 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index dec260c3af..33e9172b7c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:40:11 +DATE: 2025-10-11_16:32:02 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.924368e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.456718e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.456718e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.481369 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,011,468,293 cycles # 2.883 GHz - 2,972,689,221 instructions # 1.48 insn per cycle - 0.755097926 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +EvtsPerSec[Rmb+ME] (23) = ( 6.861766e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.949922e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.949922e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048177e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.468518 sec + 2,012,803,026 cycles # 2.822 GHz + 2,875,965,208 instructions # 1.43 insn per cycle + 0.770453877 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.978465e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.128974e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.128974e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641709e+00 +- 4.994248e+00 ) GeV^-4 -TOTAL : 1.967107 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,502,759,539 cycles # 2.928 GHz - 13,854,302,325 instructions # 2.13 insn per cycle - 2.276466534 seconds time elapsed +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.944212e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.945160e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.945160e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.893203e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.894136e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.894136e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.443358 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 24,934,407,175 cycles # 2.952 GHz - 79,115,502,595 instructions # 3.17 insn per cycle - 8.447759712 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.670365 sec + 25,029,663,251 cycles # 2.886 GHz + 79,116,596,499 instructions # 3.16 insn per cycle + 8.674407204 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.020230e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.033459e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.033459e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.709216e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.721522e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.721522e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.344217 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,541,090,448 cycles # 2.786 GHz - 20,280,124,954 instructions # 3.10 insn per cycle - 2.348689069 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.452506 sec + 6,536,185,486 cycles # 2.662 GHz + 20,295,453,995 instructions # 3.11 insn per cycle + 2.456555328 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861442972011E-004 Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.604920e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.611581e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.611581e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.562296e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.568810e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.568810e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.029784 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,846,767,262 cycles # 2.755 GHz - 7,076,446,064 instructions # 2.49 insn per cycle - 1.034215836 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.057576 sec + 2,861,881,138 cycles # 2.697 GHz + 7,094,482,774 instructions # 2.48 insn per cycle + 1.061902735 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, 
FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.797566e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.806224e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.806224e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.759096e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.767108e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.767108e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.920078 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,539,792,408 cycles # 2.749 GHz - 6,413,266,409 instructions # 2.53 insn per cycle - 0.924434981 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 0.940293 sec + 2,550,431,948 cycles # 2.703 GHz + 6,439,393,273 instructions # 2.52 insn per cycle + 0.944425361 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.411104e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.416189e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.416189e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.351978e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.356813e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.356813e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.170311 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,078,956,436 cycles # 1.771 GHz - 3,314,205,136 instructions # 1.59 insn per cycle - 1.174679954 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.220874 sec + 2,108,458,958 cycles # 1.722 GHz + 3,331,332,180 instructions # 1.58 insn per cycle + 1.225108686 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779718007E-004 Relative difference = 4.194411063934945e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 3ebd5caeb8..2a484de798 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:51:07 +DATE: 2025-10-11_16:45:41 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.481675e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.521755e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.525865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.975551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.068315e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.076540e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159396e-01 +- 3.238803e-01 ) GeV^-4 -TOTAL : 0.477918 sec -INFO: No Floating Point Exceptions have been reported - 1,990,228,523 cycles # 2.864 GHz - 2,978,927,673 instructions # 1.50 insn per cycle - 0.751663902 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.467991 sec + 2,005,858,911 cycles # 2.818 GHz + 2,853,662,043 instructions # 1.42 insn per cycle + 0.770358119 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.037728e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.099183e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.101846e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.886731 sec -INFO: No Floating Point Exceptions have been reported - 6,136,710,401 cycles # 2.909 GHz - 13,142,850,218 instructions # 2.14 insn per cycle - 2.175693489 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.941292e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.942240e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.942240e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.892862e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893799e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893799e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.455097 sec -INFO: No Floating Point Exceptions have been reported - 24,914,950,228 cycles # 2.946 GHz - 79,111,045,664 instructions # 3.18 insn per cycle - 8.459383915 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.670204 sec + 25,024,619,872 cycles # 2.885 GHz + 79,109,507,524 instructions # 3.16 insn per cycle + 8.674082417 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.977213e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.990041e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.990041e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.794380e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.806787e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.806787e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.356205 sec -INFO: No Floating Point Exceptions have been reported - 6,550,546,250 cycles # 2.776 GHz - 20,269,237,886 instructions # 3.09 insn per cycle - 2.360272003 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.419819 sec + 6,522,870,130 cycles # 2.692 GHz + 20,284,313,479 instructions # 3.11 insn per cycle + 2.423616462 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861442972011E-004 Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.601317e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.608084e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.608084e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.559254e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565757e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565757e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 1.030095 sec -INFO: No Floating Point Exceptions have been reported - 2,839,431,727 cycles # 2.748 GHz - 7,063,774,184 instructions # 2.49 insn per cycle - 1.034210988 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.057643 sec + 2,858,106,356 cycles # 2.694 GHz + 7,082,027,901 instructions # 2.48 insn per cycle + 1.061594009 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.801735e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.810193e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.810193e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.732036e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.739945e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.739945e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.916264 sec -INFO: No Floating Point Exceptions have been reported - 2,529,614,240 cycles # 2.751 GHz - 6,399,972,746 instructions # 2.53 insn per cycle - 0.920311559 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 0.953431 sec + 2,543,753,776 cycles # 2.660 GHz + 6,427,635,361 instructions # 2.53 insn per cycle + 0.957126756 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.413582e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.418711e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.418711e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.349101e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.354028e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.354028e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.166574 sec -INFO: No Floating Point Exceptions have been reported - 2,070,023,042 cycles # 1.769 GHz - 3,300,470,940 instructions # 1.59 insn per cycle - 1.170621524 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.221899 sec + 2,101,668,726 cycles # 1.716 GHz + 3,317,393,025 instructions # 1.58 insn per cycle + 1.225868499 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779718007E-004 Relative difference = 4.194411063934945e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 8aa78a916d..9f5f8217b1 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:48:16 +DATE: 2025-10-11_16:42:10 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.460370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.501314e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.505347e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.971986e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.070136e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.083717e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.475676 sec -INFO: No Floating Point Exceptions have been reported - 1,998,344,168 cycles # 2.886 GHz - 3,027,104,836 instructions # 1.51 insn per cycle - 0.748859673 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.465911 sec + 2,085,649,672 cycles # 2.824 GHz + 2,853,158,366 instructions # 1.37 insn per cycle + 0.797926486 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.172168e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.234506e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.237328e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.821851 sec -INFO: No Floating Point Exceptions have been reported - 6,001,499,639 cycles # 2.924 GHz - 13,042,334,044 instructions # 2.17 insn per cycle - 2.109220847 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.941510e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.942442e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.942442e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.887385e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.888309e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.888309e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.452200 sec -INFO: No Floating Point Exceptions have been reported - 24,907,540,526 cycles # 2.946 GHz - 79,109,866,227 instructions # 3.18 insn per cycle - 8.456266423 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.694438 sec + 25,009,094,589 cycles # 2.876 GHz + 79,110,682,076 instructions # 3.16 insn per cycle + 8.698358258 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.017369e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.030395e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.030395e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.786091e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.798676e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.798676e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.341887 sec -INFO: No Floating Point Exceptions have been reported - 6,533,658,672 cycles # 2.786 GHz - 20,270,788,705 instructions # 3.10 insn per cycle - 2.345994128 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.421571 sec + 6,521,561,343 cycles # 2.690 GHz + 20,285,907,872 instructions # 3.11 insn per cycle + 2.425622228 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861442972011E-004 Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.604029e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.610893e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.610893e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.544765e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.551053e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.551053e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.027451 sec -INFO: No Floating Point Exceptions have been reported - 2,836,206,155 cycles # 2.751 GHz - 7,065,988,768 instructions # 2.49 insn per cycle - 1.031531216 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.066479 sec + 2,853,976,312 cycles # 2.668 GHz + 7,084,427,661 instructions # 2.48 insn per cycle + 1.070436318 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.796598e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.804847e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.804847e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.733440e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.741292e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.741292e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.917896 sec -INFO: No Floating Point Exceptions have been reported - 2,527,698,465 cycles # 2.744 GHz - 6,403,574,368 instructions # 2.53 insn per cycle - 0.921906155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 0.951193 sec + 2,545,293,522 cycles # 2.667 GHz + 6,429,326,530 instructions # 2.53 insn per cycle + 0.955037744 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.414079e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419125e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419125e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.345267e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.349883e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.349883e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.164994 sec -INFO: No Floating Point Exceptions have been reported - 2,068,678,617 cycles # 1.770 GHz - 3,304,093,166 instructions # 1.60 insn per cycle - 1.169236265 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.224208 sec + 2,101,816,780 cycles # 1.713 GHz + 3,321,301,841 instructions # 1.58 insn per cycle + 1.228087953 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779718007E-004 Relative difference = 4.194411063934945e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..30c823393b --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:51:59 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.013258e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.103080e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.110808e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.479902 sec + 1,978,219,521 cycles # 2.831 GHz + 2,863,905,705 instructions # 1.45 insn per cycle + 0.755864012 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.898659e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.899570e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.899570e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.643023 sec + 24,998,550,241 cycles # 2.892 GHz + 79,111,084,095 instructions # 3.16 insn per cycle + 8.646984489 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.719385e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.731327e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.731327e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.445830 sec + 6,526,769,240 cycles # 2.665 GHz + 20,286,103,115 instructions # 3.11 insn per cycle + 2.449754025 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.565963e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.572237e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.572237e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.052461 sec + 2,851,588,130 cycles # 2.701 GHz + 7,084,479,012 instructions # 2.48 insn per cycle + 1.056444800 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.748496e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.756542e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.756542e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.942761 sec + 2,539,647,091 cycles # 2.684 GHz + 6,429,491,013 instructions # 2.53 insn per cycle + 0.946755867 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.348567e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.353355e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.353355e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.221456 sec + 2,102,747,652 cycles # 1.717 GHz + 3,321,271,092 instructions # 1.58 insn per cycle + 1.225405100 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 59696ff16e..b51802abeb 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:45:30 +DATE: 2025-10-11_16:38:43 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.026958e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.513975e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.517845e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.478506 sec -INFO: No Floating Point Exceptions have been reported - 1,992,355,788 cycles # 2.865 GHz - 3,027,729,409 instructions # 1.52 insn per cycle - 0.751914958 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.083410e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.111715e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.119810e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048177e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.467709 sec + 2,010,523,047 cycles # 2.824 GHz + 2,892,361,831 instructions # 1.44 insn per cycle + 0.770628946 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst OMP= +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.156008e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.226322e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.229025e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641709e+00 +- 4.994248e+00 ) GeV^-4 -TOTAL : 1.900625 sec -INFO: No Floating Point Exceptions have been reported - 6,225,372,770 cycles # 2.919 GHz - 12,616,761,411 instructions # 2.03 insn per cycle - 2.188103626 seconds time elapsed +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.942577e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.943527e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.943527e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.889714e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.890621e+03 ) 
sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.890621e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.447888 sec -INFO: No Floating Point Exceptions have been reported - 24,912,816,300 cycles # 2.948 GHz - 79,110,249,403 instructions # 3.18 insn per cycle - 8.452014602 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.683941 sec + 25,012,693,300 cycles # 2.880 GHz + 79,111,053,402 instructions # 3.16 insn per cycle + 8.687777898 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.980733e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.993141e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.993141e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.774197e+03 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 6.786532e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.786532e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.354354 sec -INFO: No Floating Point Exceptions have been reported - 6,535,460,807 cycles # 2.772 GHz - 20,270,869,690 instructions # 3.10 insn per cycle - 2.358646539 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.425829 sec + 6,538,669,629 cycles # 2.692 GHz + 20,286,236,268 instructions # 3.10 insn per cycle + 2.429903422 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861442972011E-004 Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.603543e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.610156e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.610156e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.538774e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.544893e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = 
( 1.544893e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.027888 sec -INFO: No Floating Point Exceptions have been reported - 2,837,672,612 cycles # 2.752 GHz - 7,066,358,168 instructions # 2.49 insn per cycle - 1.031930682 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.071044 sec + 2,851,268,280 cycles # 2.654 GHz + 7,084,649,438 instructions # 2.48 insn per cycle + 1.074854505 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.798975e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.807399e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.807399e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.734960e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.742729e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.742729e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 
-TOTAL : 0.916670 sec -INFO: No Floating Point Exceptions have been reported - 2,525,901,356 cycles # 2.745 GHz - 6,403,453,175 instructions # 2.54 insn per cycle - 0.920789172 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 0.950344 sec + 2,540,286,423 cycles # 2.664 GHz + 6,429,424,927 instructions # 2.53 insn per cycle + 0.954335905 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174396888E-004 Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.406582e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.411589e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.411589e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.326881e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.331538e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331538e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.171278 sec -INFO: No Floating Point Exceptions have been reported - 
2,071,908,739 cycles # 1.764 GHz - 3,303,987,486 instructions # 1.59 insn per cycle - 1.175442581 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.241226 sec + 2,102,177,412 cycles # 1.689 GHz + 3,321,695,580 instructions # 1.58 insn per cycle + 1.245320786 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779718007E-004 Relative difference = 4.194411063934945e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index fc006f8d57..a1ed0e1048 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:05:24 +DATE: 2025-10-11_15:26:49 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.473150e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.513248e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.516891e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.023167e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.101141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.108760e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.500476 sec -INFO: No Floating Point Exceptions have been reported - 2,066,687,911 cycles # 2.859 GHz - 3,064,980,702 instructions # 1.48 insn per cycle - 0.941605450 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.481972 sec + 2,053,644,686 cycles # 2.818 GHz + 2,906,367,138 instructions # 1.42 insn per cycle + 0.790666270 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.096999e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.159101e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.161763e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.803372 sec -INFO: No Floating Point Exceptions have been reported - 5,931,019,959 cycles # 2.909 GHz - 12,491,679,666 instructions # 2.11 insn per cycle - 2.096189929 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.927739e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.928675e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.928675e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.911966e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.912904e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912904e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.512686 sec -INFO: No Floating Point Exceptions have been reported - 24,976,995,918 cycles # 2.933 GHz - 78,849,322,260 instructions # 3.16 insn per cycle - 8.521021644 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.582602 sec + 24,849,332,204 cycles # 2.895 GHz + 78,811,199,944 instructions # 3.17 insn per cycle + 8.586531797 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2999) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866250177339E-004 -Relative difference = 5.65798569465384e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863279149748E-004 +Relative difference = 4.947803358686673e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.196617e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.210064e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.210064e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.802565e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.815087e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.815087e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.283841 sec -INFO: No Floating Point Exceptions have been reported - 6,462,353,077 cycles # 2.825 GHz - 20,230,287,596 instructions # 3.13 insn per cycle - 2.291660153 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13491) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.415633 sec + 6,482,490,857 cycles # 2.680 GHz + 20,247,828,097 instructions # 3.12 insn per cycle + 2.419608944 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13541) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861448331612E-004 Relative difference = 2.1853408865157068e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.507603e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.513399e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.513399e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.493020e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.499074e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.499074e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.094262 sec -INFO: No Floating Point Exceptions have been reported - 2,977,852,840 cycles # 2.716 GHz - 7,207,139,157 instructions # 2.42 insn per cycle - 1.100869463 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12437) (512y: 0) (512z: 0) +TOTAL : 1.103256 sec + 2,994,004,582 cycles # 2.706 GHz + 7,224,670,986 instructions # 2.41 insn per cycle + 1.107361000 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12455) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271939668088170E-004 Relative difference = 5.008331292535666e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.740158e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.747960e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.747960e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.703839e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.711671e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.711671e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.947565 sec -INFO: No Floating Point Exceptions have been reported - 2,615,044,427 cycles # 2.750 GHz - 6,545,142,442 instructions # 2.50 insn per cycle - 0.954571468 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11449) (512y: 27) (512z: 0) +TOTAL : 0.967356 sec + 2,634,233,834 cycles # 2.714 GHz + 6,565,459,296 instructions # 2.49 insn per cycle + 0.971230309 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11486) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271939668088170E-004 Relative difference = 5.008331292535666e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.344321e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.349023e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.349023e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.318889e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.323344e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323344e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.225060 sec -INFO: No Floating Point Exceptions have been reported - 2,140,395,059 cycles # 1.742 GHz - 3,462,158,546 instructions # 1.62 insn per cycle - 1.232075146 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3037) (512y: 25) (512z: 9677) +TOTAL : 1.248532 sec + 2,165,605,341 cycles # 1.730 GHz + 3,476,565,175 instructions # 1.61 insn per cycle + 1.252574898 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3018) (512y: 20) (512z: 9665) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952032316561E-004 Relative difference = 3.066631594207157e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 507fa267fb..c3e94ba26d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:32:18 +DATE: 2025-10-11_16:22:45 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.570913e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.612300e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.616113e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059597e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.500062 sec -INFO: No Floating Point Exceptions have been reported - 2,077,093,809 cycles # 2.883 GHz - 3,095,482,027 instructions # 1.49 insn per cycle - 0.782648151 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.980018e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.060840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.068475e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.483472 sec + 2,078,701,556 cycles # 2.836 GHz + 2,938,258,784 instructions # 1.41 insn per cycle + 0.794272127 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.624378e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.693284e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.696098e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.736663 sec -INFO: No Floating Point Exceptions have been reported - 5,745,039,966 cycles # 2.917 GHz - 12,243,347,327 instructions # 2.13 insn per cycle - 2.029186282 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262669162351490E-004 -Relative difference = 2.8232862531213374e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262664051428000E-004 +Relative difference = 2.8460897599042618e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.610943e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.611718e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.611718e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.536396e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.537181e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.537181e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 29.233986 sec -INFO: No Floating Point Exceptions have been reported - 86,131,386,822 cycles # 2.946 GHz - 135,652,659,903 instructions # 1.57 insn per cycle - 29.237672033 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:15856) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 29.627851 sec + 85,239,542,827 cycles # 2.877 GHz + 134,215,968,109 instructions # 1.57 insn per cycle + 29.631730646 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:15099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275349717465765E-004 -Relative difference = 4.26303654465793e-09 +Avg ME (F77/C++) = 6.6275349049735310E-004 +Relative difference = 1.4338131648076968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.849906e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.862163e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.862163e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.562878e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.574411e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.574411e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.399244 sec -INFO: No Floating Point Exceptions have been reported - 6,757,771,203 cycles # 2.813 GHz - 19,352,943,673 instructions # 2.86 insn per cycle - 2.403059869 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:69577) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.504142 sec + 6,771,535,920 cycles # 2.701 GHz + 19,207,882,725 instructions # 2.84 insn per cycle + 2.508192424 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:68781) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274862748188362E-004 Relative difference = 4.14665283800746e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.430057e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.435326e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.435326e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.450780e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.456226e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.456226e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.151867 sec -INFO: No Floating Point Exceptions have been reported - 3,169,480,733 cycles # 2.744 GHz - 6,794,963,559 instructions # 2.14 insn per cycle - 1.155607574 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:49034) (512y: 0) (512z: 0) +TOTAL : 1.135519 sec + 3,073,910,834 cycles # 2.700 GHz + 6,671,130,394 instructions # 2.17 insn per cycle + 1.139479935 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47844) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731568543797E-004 Relative difference = 2.3668012430631962e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.731154e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.739005e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.739005e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.771981e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.780020e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.780020e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 0.952402 sec -INFO: No Floating Point Exceptions have been reported - 2,622,407,179 cycles # 2.744 GHz - 5,970,044,618 instructions # 2.28 insn per cycle - 0.956238068 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42602) (512y: 11) (512z: 0) +TOTAL : 0.930511 sec + 2,525,041,206 cycles # 2.704 GHz + 5,950,807,908 instructions # 2.36 insn per cycle + 0.934389144 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42169) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731568543797E-004 Relative difference = 2.3668012430631962e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.414435e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419474e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419474e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.326409e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.331048e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331048e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.165045 sec -INFO: No Floating Point Exceptions have been reported - 2,067,228,248 cycles # 1.769 GHz - 3,495,098,954 instructions # 1.69 insn per cycle - 1.168981438 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5208) (512y: 3) (512z:44858) +TOTAL : 1.241611 sec + 2,116,308,082 cycles # 1.700 GHz + 3,522,579,874 instructions # 1.66 insn per cycle + 1.245792482 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5213) (512y: 3) (512z:44839) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 Avg ME (F77/C++) = 6.6272750237027223E-004 Relative difference = 3.5765412974815996e-09 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index 2595c32afa..0bef615dd8 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:33:09 +DATE: 2025-10-11_16:23:46 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.573938e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.613715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.617455e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059597e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.493227 sec -INFO: No Floating Point Exceptions have been reported - 2,049,677,908 cycles # 2.879 GHz - 3,032,655,926 instructions # 1.48 insn per cycle - 0.769218706 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.071174e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.149873e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.157266e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.480187 sec + 2,056,422,141 cycles # 2.821 GHz + 2,909,868,255 instructions # 1.42 insn per cycle + 0.789769149 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.673337e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.742674e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.745488e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.731870 sec -INFO: No Floating Point Exceptions have been reported - 5,773,880,906 cycles # 2.919 GHz - 12,286,627,464 instructions # 2.13 insn per cycle - 2.034768323 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262669162351490E-004 -Relative difference = 2.8232862531213374e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262664051428000E-004 +Relative difference = 2.8460897599042618e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.600277e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.601076e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.601076e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.550689e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.551508e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.551508e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 29.289301 sec -INFO: No Floating Point Exceptions have been reported - 86,207,606,672 cycles # 2.943 GHz - 135,355,986,373 instructions # 1.57 insn per cycle - 29.293063672 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:15471) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 29.550873 sec + 85,210,035,482 cycles # 2.883 GHz + 134,053,525,503 instructions # 1.57 insn per cycle + 29.554932127 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:15171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275349662128086E-004 -Relative difference = 5.098002770919431e-09 +Avg ME (F77/C++) = 6.6275349729240374E-004 +Relative difference = 4.085374577342176e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.848001e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.860244e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.860244e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.704049e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.715826e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.715826e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.399823 sec -INFO: No Floating Point Exceptions have been reported - 6,855,955,670 cycles # 2.853 GHz - 19,471,788,292 instructions # 2.84 insn per cycle - 2.403723205 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:69876) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.451563 sec + 6,575,110,645 cycles # 2.679 GHz + 19,101,194,250 instructions # 2.91 insn per cycle + 2.455617178 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:68204) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274862799683282E-004 Relative difference = 4.2243518621014775e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.455129e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.460639e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.460639e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.461044e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.466509e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.466509e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.132031 sec -INFO: No Floating Point Exceptions have been reported - 3,102,391,764 cycles # 2.733 GHz - 6,715,014,781 instructions # 2.16 insn per cycle - 1.135898458 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47692) (512y: 0) (512z: 0) +TOTAL : 1.127472 sec + 3,056,173,108 cycles # 2.702 GHz + 6,654,226,606 instructions # 2.18 insn per cycle + 1.131533762 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47010) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731623419345E-004 Relative difference = 2.449603850635964e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.738588e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.746518e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.746518e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.769806e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.777757e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.777757e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 0.948137 sec -INFO: No Floating Point Exceptions have been reported - 2,626,199,962 cycles # 2.761 GHz - 5,966,019,567 instructions # 2.27 insn per cycle - 0.951931849 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41858) (512y: 13) (512z: 0) +TOTAL : 0.931579 sec + 2,522,992,718 cycles # 2.700 GHz + 5,975,076,879 instructions # 2.37 insn per cycle + 0.935429613 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41660) (512y: 11) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731623419345E-004 Relative difference = 2.449603850635964e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.414552e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419616e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419616e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.345570e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.350413e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.350413e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.164736 sec -INFO: No Floating Point Exceptions have been reported - 2,067,746,434 cycles # 1.771 GHz - 3,487,891,958 instructions # 1.69 insn per cycle - 1.168545250 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4171) (512y: 4) (512z:44494) +TOTAL : 1.223621 sec + 2,097,428,008 cycles # 1.710 GHz + 3,514,537,932 instructions # 1.68 insn per cycle + 1.227733047 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4173) (512y: 4) (512z:44470) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 Avg ME (F77/C++) = 6.6272750247886592E-004 Relative difference = 3.740400032174438e-09 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..10d80cdca4 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:43:12 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.858419e+05 1 256 +3.745329e+05 2 256 +3.897177e+05 4 256 +4.239569e+05 8 256 +4.437166e+05 16 256 +4.444009e+05 32 256 +4.485074e+05 64 256 +4.433314e+05 128 256 +4.512938e+05 256 256 +4.568500e+05 512 256 +4.555629e+05 1024 256 +### GPU: scaling test 32 +5.657558e+04 1 32 +1.070333e+05 2 32 +1.849532e+05 4 32 +2.657280e+05 8 32 +3.949685e+05 16 32 +3.946154e+05 32 32 +4.350193e+05 64 32 +4.473966e+05 128 32 +4.519860e+05 256 32 +4.459799e+05 512 32 +4.463425e+05 1024 32 +4.512453e+05 2048 32 +4.596972e+05 4096 32 +4.567015e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.832892e+03 1 256 +1.824058e+03 2 256 +1.836696e+03 4 256 +### CPU: scaling test 32 +1.828347e+03 1 32 +1.832242e+03 2 32 +1.831046e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.486552e+03 1 256 +3.490138e+03 2 256 +3.498447e+03 4 256 +### CPU: scaling test 32 +3.349673e+03 1 32 +3.424966e+03 2 32 +3.419275e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.965219e+03 1 256 +7.977523e+03 2 256 +8.081277e+03 4 256 +### CPU: scaling 
test 32 +7.768804e+03 1 32 +7.471564e+03 2 32 +7.954694e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.159079e+03 1 256 +9.181848e+03 2 256 +9.256886e+03 4 256 +### CPU: scaling test 32 +8.945974e+03 1 32 +8.898384e+03 2 32 +8.978221e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.830723e+03 1 256 +6.905755e+03 2 256 +6.932432e+03 4 256 +### CPU: scaling test 32 +6.653413e+03 1 32 +6.716747e+03 2 32 +6.760196e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index a3a2deda6e..e3e2b43997 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:03:48 +DATE: 2025-10-11_15:24:46 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.318725e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.347238e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.349358e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.393156e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.441810e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.445057e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.539210 sec -INFO: No Floating Point Exceptions have been reported - 2,220,963,802 cycles # 2.880 GHz - 3,406,426,816 instructions # 1.53 insn per cycle - 0.832307462 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.502434 sec + 2,151,870,507 cycles # 2.842 GHz + 3,130,235,445 instructions # 1.45 insn per cycle + 0.824960007 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134167e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165985e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.047029 sec -INFO: No Floating Point Exceptions have been reported - 9,687,290,131 cycles # 2.924 GHz - 21,862,744,253 instructions # 2.26 insn per cycle - 3.379254641 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266732376103494E-004 -Relative difference = 2.659538381540814e-07 +Avg ME (F77/GPU) = 6.6266731567731949E-004 +Relative difference = 2.781525885774229e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.868179e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.869079e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.869079e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.825164e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.826053e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.826053e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.786228 sec -INFO: No Floating Point Exceptions have been reported - 25,910,148,307 cycles # 2.949 GHz - 79,427,985,275 instructions # 3.07 insn per cycle - 8.790193498 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4775) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.992021 sec + 26,029,577,464 cycles # 2.894 GHz + 79,114,128,675 instructions # 3.04 insn per cycle + 8.996124488 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.521065e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.524381e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.524381e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.429291e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.432449e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.432449e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.666859 sec -INFO: No Floating Point Exceptions have been reported - 12,831,991,791 cycles # 2.749 GHz - 38,825,085,312 instructions # 3.03 insn per cycle - 4.671138327 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13173) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.789072 sec + 12,824,725,318 cycles # 2.676 GHz + 38,757,792,368 instructions # 3.02 insn per cycle + 4.793199776 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.087173e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.104021e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.104021e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.935628e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.953025e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.953025e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.035173 sec -INFO: No Floating Point Exceptions have been reported - 5,594,158,972 cycles # 2.744 GHz - 13,617,938,147 instructions # 2.43 insn per cycle - 2.039272194 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11427) (512y: 0) (512z: 0) +TOTAL : 2.072950 sec + 5,562,263,841 cycles # 2.679 GHz + 13,540,518,730 instructions # 2.43 insn per cycle + 2.077092697 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.329915e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.351715e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.351715e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.986204e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.007643e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.007643e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.764965 sec -INFO: No Floating Point Exceptions have been reported - 4,865,961,098 cycles # 2.752 GHz - 12,296,280,016 instructions # 2.53 insn per cycle - 1.768959352 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10331) (512y: 80) (512z: 0) +TOTAL : 1.831318 sec + 4,854,515,630 cycles # 2.646 GHz + 12,237,415,635 instructions # 2.52 insn per cycle + 1.835524858 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.944494e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.956947e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.956947e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.899014e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.911241e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.911241e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.368908 sec -INFO: No Floating Point Exceptions have been reported - 4,175,656,001 cycles # 1.761 GHz - 6,394,856,033 instructions # 1.53 insn per cycle - 2.373043514 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1983) (512y: 92) (512z: 9360) +TOTAL : 2.383753 sec + 4,111,562,734 cycles # 1.722 GHz + 6,282,557,303 instructions # 1.53 insn per cycle + 2.388073448 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..5eb0658f4e --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:59:44 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.478169e+05 1 256 +2.269338e+05 2 256 +2.908405e+05 4 256 +3.460040e+05 8 256 +3.706753e+05 16 256 +3.850253e+05 32 256 +3.834285e+05 64 256 +3.887436e+05 128 256 +3.877878e+05 256 256 +3.930166e+05 512 256 +4.044746e+05 1024 256 +### GPU: scaling test 32 +2.315019e+04 1 32 +4.199167e+04 2 32 +8.231040e+04 4 32 +1.430769e+05 8 32 +2.353840e+05 16 32 +2.941154e+05 32 32 +3.501493e+05 64 32 +3.762161e+05 128 32 +3.849858e+05 256 32 +3.843601e+05 512 32 +3.882366e+05 1024 32 +3.853348e+05 2048 32 +3.939954e+05 4096 32 +4.042764e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.820929e+03 1 256 +1.819554e+03 2 256 +1.824693e+03 4 256 +### CPU: scaling test 32 +1.809922e+03 1 32 +1.818380e+03 2 32 +1.829598e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.467484e+03 1 256 +3.477201e+03 2 256 +3.483666e+03 4 256 +### CPU: scaling test 32 +3.376210e+03 1 32 +3.385787e+03 2 32 +3.462870e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.773756e+03 1 256 +7.868538e+03 2 256 +7.891583e+03 4 256 +### CPU: scaling 
test 32 +7.767594e+03 1 32 +7.512875e+03 2 32 +7.861406e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.905874e+03 1 256 +9.000800e+03 2 256 +9.159354e+03 4 256 +### CPU: scaling test 32 +9.007891e+03 1 32 +8.853559e+03 2 32 +8.999340e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.725095e+03 1 256 +6.926689e+03 2 256 +6.793100e+03 4 256 +### CPU: scaling test 32 +6.759773e+03 1 32 +6.705987e+03 2 32 +6.758642e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..8b06b13019 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:53:12 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.813357e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.847839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.850325e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.193508 sec + 4,401,135,195 cycles # 2.829 GHz + 6,108,788,422 instructions # 1.39 insn per cycle + 1.613268691 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266733778757203E-004 +Relative difference = 2.447870582934832e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.815440e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816305e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.816305e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.040328 sec + 26,031,336,563 cycles # 2.879 GHz + 79,117,154,926 instructions # 3.04 insn per cycle + 9.044442399 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731406016235E-004 +Relative difference = 2.8059296349552523e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.427905e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.431039e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.431039e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.790651 sec + 12,832,687,294 cycles # 2.677 GHz + 38,758,106,395 instructions # 3.02 insn per cycle + 4.794734568 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730246908442E-004 +Relative difference = 2.98084507782618e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.935202e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.951558e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.951558e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.072958 sec + 5,568,085,348 cycles # 2.682 GHz + 13,540,506,751 instructions # 2.43 insn per cycle + 2.076971724 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 9.161412e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.183655e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.183655e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.796303 sec + 4,854,337,043 cycles # 2.698 GHz + 12,237,142,563 instructions # 2.52 insn per cycle + 1.800481736 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.873484e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.885441e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.885441e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.392508 sec + 4,106,170,622 cycles # 1.714 GHz + 6,282,499,145 instructions # 1.53 insn per cycle + 2.396728116 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..1a693ccc02 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt @@ -0,0 +1,223 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:51:16 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.425282e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.474579e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.477977e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.505604 sec + 2,079,342,335 cycles # 2.823 GHz + 3,110,113,358 instructions # 1.50 insn per cycle + 0.804143585 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266731567731949E-004 +Relative difference = 2.781525885774229e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.820544e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.821419e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.821419e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.014922 sec + 26,029,815,792 cycles # 2.887 GHz + 79,113,148,007 instructions # 3.04 insn per cycle + 9.018853711 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731406016235E-004 +Relative difference = 2.8059296349552523e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.422911e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.426145e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.426145e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.797700 sec + 12,826,872,860 cycles # 2.672 GHz + 38,756,601,713 instructions # 3.02 insn per cycle + 4.801871860 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730246908442E-004 +Relative difference = 2.98084507782618e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.944046e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.960023e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.960023e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.070707 sec + 5,566,396,722 cycles # 2.684 GHz + 13,540,340,017 instructions # 2.43 insn per cycle + 2.074804703 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 9.072103e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.093961e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.093961e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.814093 sec + 4,852,758,403 cycles # 2.670 GHz + 12,237,059,875 instructions # 2.52 insn per cycle + 1.818055824 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.846048e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.858465e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.858465e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.401888 sec + 4,113,800,876 cycles # 1.711 GHz + 6,282,877,511 instructions # 1.53 insn per cycle + 2.405935799 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index f598011718..55816a282e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:04:22 +DATE: 2025-10-11_15:25:29 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.335025e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.357927e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.359916e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.409960e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.457193e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.460417e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.537353 sec -INFO: No Floating Point Exceptions have been reported - 2,216,980,042 cycles # 2.869 GHz - 3,463,326,813 instructions # 1.56 insn per cycle - 0.836472238 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.500032 sec + 2,128,939,464 cycles # 2.818 GHz + 3,048,895,103 instructions # 1.43 insn per cycle + 0.815266921 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.141323e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.172030e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.173253e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.034442 sec -INFO: No Floating Point Exceptions have been reported - 9,665,974,027 cycles # 2.922 GHz - 21,248,987,108 instructions # 2.20 insn per cycle - 3.363171619 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266732376103494E-004 -Relative difference = 2.659538381540814e-07 +Avg ME (F77/GPU) = 6.6266731567731949E-004 +Relative difference = 2.781525885774229e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.862251e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.863154e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.863154e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.835004e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.835894e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.835894e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.813876 sec -INFO: No Floating Point Exceptions have been reported - 25,987,730,158 cycles # 2.948 GHz - 79,453,128,863 instructions # 3.06 insn per cycle - 8.817767368 seconds time elapsed +TOTAL : 8.943891 sec + 25,955,962,699 cycles # 2.901 GHz + 79,198,038,648 instructions # 3.05 insn per cycle + 8.947961266 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following 
Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.512571e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.515785e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.515785e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.464500e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.467677e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.467677e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.675994 sec -INFO: No Floating Point Exceptions have been reported - 12,822,983,844 cycles # 2.741 GHz - 38,780,874,555 instructions # 3.02 insn per cycle - 4.681038643 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12935) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.740131 sec + 12,742,308,756 cycles # 2.686 GHz + 38,685,964,134 instructions # 3.04 insn per cycle + 4.744223175 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12933) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW 
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.056370e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.072927e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.072927e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.985627e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.001632e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.001632e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.042531 sec -INFO: No Floating Point Exceptions have been reported - 5,590,175,615 cycles # 2.733 GHz - 13,732,675,080 instructions # 2.46 insn per cycle - 2.046647326 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11510) (512y: 0) (512z: 0) +TOTAL : 2.059737 sec + 5,594,595,243 cycles # 2.712 GHz + 13,643,577,301 instructions # 2.44 insn per cycle + 2.063806863 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11479) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.148791e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.170046e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.170046e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.864560e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.884766e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.884766e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.800883 sec -INFO: No Floating Point Exceptions have been reported - 4,955,825,709 cycles # 2.749 GHz - 12,423,990,964 instructions # 2.51 insn per cycle - 1.804980058 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10322) (512y: 240) (512z: 0) +TOTAL : 1.855976 sec + 5,031,540,017 cycles # 2.706 GHz + 12,343,462,839 instructions # 2.45 insn per cycle + 1.860103785 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10307) (512y: 226) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.851374e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.863307e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.863307e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.836346e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.848432e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.848432e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.400794 sec -INFO: No Floating Point Exceptions have been reported - 4,218,682,996 cycles # 1.755 GHz - 6,496,899,309 instructions # 1.54 insn per cycle - 2.406253121 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1806) (512y: 190) (512z: 9358) +TOTAL : 2.405420 sec + 4,109,302,173 cycles # 1.706 GHz + 6,383,895,140 instructions # 1.55 insn per cycle + 2.409513085 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1734) (512y: 178) (512z: 9357) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..f43e214106 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_15:45:06 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.314898e+04 1 256 +1.332401e+04 2 256 +1.369745e+04 4 256 +1.359022e+04 8 256 +1.360893e+04 16 256 +1.354758e+04 32 256 +1.335068e+04 64 256 +1.340355e+04 128 256 +1.338225e+04 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +6.222590e+03 1 32 +1.054070e+04 2 32 +1.256578e+04 4 32 +1.334543e+04 8 32 +1.351998e+04 16 32 +1.363026e+04 32 32 +1.353031e+04 64 32 +1.331302e+04 128 32 +1.311792e+04 256 32 +1.318049e+04 512 32 +1.308983e+04 1024 32 +1.314766e+04 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.572551e+01 1 256 +7.477397e+01 2 256 +7.590781e+01 4 256 +### CPU: scaling test 32 +7.544857e+01 1 32 +7.629914e+01 2 32 +7.644630e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.436664e+02 1 256 +1.430259e+02 2 256 +1.425156e+02 4 256 +### CPU: scaling test 32 +1.332283e+02 1 32 +1.407923e+02 2 32 +1.434345e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.322512e+02 1 256 +3.302235e+02 2 256 +3.299895e+02 4 256 +### CPU: scaling test 32 +3.290820e+02 1 32 +3.272276e+02 2 32 +3.284861e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.744622e+02 1 256 +3.794847e+02 2 256 +3.813583e+02 4 256 +### CPU: scaling test 32 +3.817338e+02 1 32 +3.782027e+02 2 32 +3.808702e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.362403e+02 1 256 +3.316419e+02 2 256 +3.338911e+02 4 256 +### CPU: scaling test 32 +3.305571e+02 1 32 +3.318824e+02 2 32 +3.293878e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 17692fc5fb..cc68408e75 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,189 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:07:10 +DATE: 2025-10-11_15:29:32 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.059500e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.059934e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060148e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.298542e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.302743e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.303449e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.453264 sec -INFO: No Floating Point Exceptions have been reported - 8,089,923,192 cycles # 2.904 GHz - 15,932,007,883 instructions # 1.97 insn per cycle - 2.843483231 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.859583 sec + 3,373,995,346 cycles # 2.854 GHz + 5,824,456,888 instructions # 1.73 insn per cycle + 1.243469488 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.246459e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.248360e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.248591e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.340939e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.341409e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.341443e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.019480 sec -INFO: No Floating Point Exceptions have been reported - 12,563,980,059 cycles # 2.886 GHz - 29,860,686,581 instructions # 2.38 insn per cycle - 4.410635015 seconds time elapsed +TOTAL : 2.040862 sec + 6,994,210,497 cycles # 2.880 GHz + 14,374,198,066 instructions # 2.06 insn per cycle + 2.485321107 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/GPU) = 9.8722595284406675E-003 +Relative difference = 3.5164777636791134e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating 
Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.535286e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.535490e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.535490e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.481211e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.481430e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.481430e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.007645 sec -INFO: No Floating Point Exceptions have been reported - 18,987,096,753 cycles # 2.709 GHz - 53,904,905,030 instructions # 2.84 insn per cycle - 7.011475835 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32424) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.060224 sec + 18,790,658,377 cycles # 2.660 GHz + 53,598,343,943 instructions # 2.85 insn per cycle + 7.064353743 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.576045e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.576133e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.576133e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.428763e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.428836e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.428836e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.352060 sec -INFO: No Floating Point Exceptions have been reported - 9,813,557,960 cycles # 2.925 GHz - 27,153,109,398 instructions # 2.77 insn per cycle - 3.355902855 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96492) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.697310 sec + 9,985,153,992 cycles # 2.699 GHz + 27,152,471,347 instructions # 2.72 insn per cycle + 3.701453086 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96385) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.392533e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.392946e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.392946e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.245847e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.246221e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.246221e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.558312 sec -INFO: No Floating Point Exceptions have been reported - 4,259,121,658 cycles # 2.728 GHz - 9,591,809,021 instructions # 2.25 insn per cycle - 1.562248696 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84961) (512y: 0) (512z: 0) +TOTAL : 1.628561 sec + 4,350,647,315 cycles # 2.666 GHz + 9,591,385,784 instructions # 2.20 insn per cycle + 1.632600458 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84998) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.852746e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.853256e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.853256e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.817880e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.818408e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.818408e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.371089 sec -INFO: No Floating Point Exceptions have been reported - 3,728,351,942 cycles # 2.713 GHz - 8,515,110,933 instructions # 2.28 insn per cycle - 1.374961080 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80609) (512y: 90) (512z: 0) +TOTAL : 1.385265 sec + 3,747,713,325 cycles # 2.699 GHz + 8,516,229,683 instructions # 2.27 insn per cycle + 1.389377029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80598) (512y: 55) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.432608e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.433087e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.433087e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.278490e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.278974e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.278974e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.541076 sec -INFO: No Floating Point Exceptions have been reported - 2,702,698,179 cycles # 1.750 GHz - 4,282,306,811 instructions # 1.58 insn per cycle - 1.545099546 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2856) (512y: 102) (512z:79114) +TOTAL : 1.612258 sec + 2,716,765,553 cycles # 1.682 GHz + 4,276,097,512 instructions # 1.57 insn per cycle + 1.616451427 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 71) (512z:79097) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
 DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722595285411531E-003
 Relative difference = 3.516375977906115e-07
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling
new file mode 100644
index 0000000000..8b91486c13
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling
@@ -0,0 +1,118 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+DATE: 2025-10-11_16:01:16
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=1
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+1.582972e+04 1 256
+1.581496e+04 2 256
+1.648948e+04 4 256
+1.646203e+04 8 256
+1.669439e+04 16 256
+1.647826e+04 32 256
+1.616020e+04 64 256
+1.617952e+04 128 256
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+### GPU: scaling test 32
+6.365790e+03 1 32
+1.117842e+04 2 32
+1.456730e+04 4 32
+1.611806e+04 8 32
+1.598649e+04 16 32
+1.653700e+04 32 32
+1.595595e+04 64 32
+1.589958e+04 128 32
+1.560604e+04 256 32
+1.549794e+04 512 32
+1.560588e+04 1024 32
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+7.550960e+01 1 256
+7.583079e+01 2 256
+7.562936e+01 4 256
+### CPU: scaling test 32
+7.095115e+01 1 32
+7.526184e+01 2 32
+7.561728e+01 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.416397e+02 1 256
+1.419941e+02 2 256
+1.424152e+02 4 256
+### CPU: scaling test 32
+1.379937e+02 1 32
+1.386213e+02 2 32
+1.419191e+02 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+3.312097e+02 1 256
+3.311144e+02 2 256
+3.322186e+02 4 256
+### CPU: scaling test 32
+3.304901e+02 1 32
+3.322880e+02 2 32
+3.277376e+02 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+3.821829e+02 1 256
+3.805165e+02 2 256
+3.788227e+02 4 256
+### CPU: scaling test 32
+3.729139e+02 1 32
+3.757926e+02 2 32
+3.738019e+02 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+3.317613e+02 1 256
+3.319298e+02 2 256
+3.365958e+02 4 256
+### CPU: scaling test 32
+3.353901e+02 1 32
+3.366346e+02 2 32
+3.378136e+02 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
index 1cf857b709..4b40dd2c65 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
 
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,6 +10,7 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make: Nothing to be done for 'all'.
@@ -21,239 +25,197 @@ make: Nothing to be done for 'all'.
 make: Nothing to be done for 'all'.
-DATE: 2024-10-06_09:40:38 +DATE: 2025-10-11_16:32:38 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.054825e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.057209e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.057209e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.248729e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.286569e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.286569e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.388056 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 7,931,671,790 cycles # 2.924 GHz - 17,623,602,431 instructions # 2.22 insn per cycle - 2.770306640 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +TOTAL : 0.825135 sec + 3,263,718,300 cycles # 2.850 GHz + 5,063,977,049 instructions # 1.55 insn per cycle + 1.201910757 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.226146e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.260909e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.260909e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.351586e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.359293e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.359293e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.992337 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 12,629,951,963 cycles # 2.926 GHz - 29,269,734,483 instructions # 2.32 insn per cycle - 4.375813430 seconds time elapsed +TOTAL : 2.006826 sec + 6,868,164,513 cycles # 2.869 GHz + 12,771,043,874 instructions # 1.86 insn per cycle + 2.451670895 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/GPU) = 9.8722595284406675E-003 +Relative difference = 3.5164777636791134e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.889828e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.890068e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.890068e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.508335e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.508560e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.508560e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.696425 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 18,936,809,312 cycles # 2.827 GHz - 53,907,854,112 instructions # 2.85 insn per cycle - 6.700731218 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32424) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.038136 sec + 18,717,847,899 cycles # 2.659 GHz + 53,598,418,673 instructions # 2.86 insn per cycle + 7.042371275 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.586455e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.586548e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.586548e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.418673e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.418747e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.418747e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.330534 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 9,805,857,457 cycles # 2.941 GHz - 27,153,288,385 instructions # 2.77 insn per cycle - 3.335034911 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96492) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.725271 sec + 9,999,898,907 cycles # 2.682 GHz + 27,154,408,541 instructions # 2.72 insn per cycle + 3.729470107 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96385) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.386158e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.386550e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.386550e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.288517e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.288903e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.288903e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.562759 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,284,138,212 cycles # 2.735 GHz - 9,593,930,746 instructions # 2.24 insn per cycle - 1.567182963 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84961) (512y: 0) (512z: 0) +TOTAL : 1.608418 sec + 4,321,971,855 cycles # 2.681 GHz + 9,593,457,987 instructions # 2.22 insn per cycle + 1.612824235 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84998) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.892770e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.893321e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.893321e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.731794e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.732300e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.732300e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.359134 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,729,884,424 cycles # 2.737 GHz - 8,517,697,790 instructions # 2.28 insn per cycle - 1.363667603 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80609) (512y: 90) (512z: 0) +TOTAL : 1.417269 sec + 3,781,284,257 cycles # 2.661 GHz + 8,518,492,306 instructions # 2.25 insn per cycle + 1.421504706 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80598) (512y: 55) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.423206e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.423718e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.423718e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.320041e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.320569e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.320569e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.547281 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,698,269,777 cycles # 1.739 GHz - 4,283,935,635 instructions # 1.59 insn per cycle - 1.552053679 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2856) (512y: 102) (512z:79114) +TOTAL : 1.593109 sec + 2,718,981,575 cycles # 1.703 GHz + 4,277,734,000 instructions # 1.57 insn per cycle + 1.597391554 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 71) (512z:79097) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index bc67f5cacf..a8f385308e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,189 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:08:38 +DATE: 2025-10-11_15:31:21 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.058591e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.058974e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.059077e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.314413e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.318852e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.319620e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.451568 sec -INFO: No Floating Point Exceptions have been reported - 8,115,809,761 cycles # 2.919 GHz - 18,292,352,744 instructions # 2.25 insn per cycle - 2.835762935 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.824375 sec + 3,263,300,002 cycles # 2.859 GHz + 5,743,287,797 instructions # 1.76 insn per cycle + 1.201709138 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.228388e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.230439e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.230672e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.342823e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.343338e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.343373e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.019291 sec -INFO: No Floating Point Exceptions have been reported - 12,725,284,497 cycles # 2.922 GHz - 29,505,773,730 instructions # 2.32 insn per cycle - 4.410068917 seconds time elapsed +TOTAL : 2.030004 sec + 6,944,802,894 cycles # 2.872 GHz + 14,733,879,509 instructions # 2.12 insn per cycle + 2.474432206 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/GPU) = 9.8722595284406675E-003 +Relative difference = 3.5164777636791134e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating 
Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.905987e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.906203e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.906203e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.570860e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.571065e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.571065e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.685741 sec -INFO: No Floating Point Exceptions have been reported - 18,901,791,742 cycles # 2.826 GHz - 53,936,334,501 instructions # 2.85 insn per cycle - 6.689520607 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32022) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.976560 sec + 18,730,478,677 cycles # 2.684 GHz + 53,589,432,540 instructions # 2.86 insn per cycle + 6.980695916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32012) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.555988e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.556078e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.556078e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.411301e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.411372e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.411372e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.395185 sec -INFO: No Floating Point Exceptions have been reported - 9,954,308,036 cycles # 2.929 GHz - 27,130,330,125 instructions # 2.73 insn per cycle - 3.399134205 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96368) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.742394 sec + 10,077,544,611 cycles # 2.691 GHz + 27,148,181,137 instructions # 2.69 insn per cycle + 3.746519189 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96336) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.364235e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.364649e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.364649e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.358190e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.358704e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.358704e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.571658 sec -INFO: No Floating Point Exceptions have been reported - 4,284,967,782 cycles # 2.721 GHz - 9,585,542,173 instructions # 2.24 insn per cycle - 1.575575323 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84968) (512y: 0) (512z: 0) +TOTAL : 1.574465 sec + 4,261,924,263 cycles # 2.701 GHz + 9,596,051,273 instructions # 2.25 insn per cycle + 1.578699681 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.898680e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.899276e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.899276e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.774770e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.775320e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.775320e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.358371 sec -INFO: No Floating Point Exceptions have been reported - 3,717,774,700 cycles # 2.731 GHz - 8,507,853,536 instructions # 2.29 insn per cycle - 1.362296235 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80632) (512y: 240) (512z: 0) +TOTAL : 1.400584 sec + 3,755,242,155 cycles # 2.675 GHz + 8,521,276,194 instructions # 2.27 insn per cycle + 1.404663616 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80635) (512y: 225) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.399522e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.400013e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.400013e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.329909e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.330461e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.330461e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.555521 sec -INFO: No Floating Point Exceptions have been reported - 2,693,302,897 cycles # 1.729 GHz - 4,281,674,096 instructions # 1.59 insn per cycle - 1.559394081 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2693) (512y: 184) (512z:79098) +TOTAL : 1.587980 sec + 2,712,476,158 cycles # 1.704 GHz + 4,282,456,457 instructions # 1.58 insn per cycle + 1.592350341 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2702) (512y: 175) (512z:79107) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
 DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722595285411531E-003
 Relative difference = 3.516375977906115e-07
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling
new file mode 100644
index 0000000000..2d50000d27
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling
@@ -0,0 +1,118 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+make: Nothing to be done for 'all'.
+
+DATE: 2025-10-11_15:49:04
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+3.189617e+04 1 256
+3.247454e+04 2 256
+3.572888e+04 4 256
+3.576406e+04 8 256
+3.574054e+04 16 256
+3.604686e+04 32 256
+3.591831e+04 64 256
+3.590498e+04 128 256
+3.586335e+04 256 256
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+### GPU: scaling test 32
+7.716223e+03 1 32
+1.405251e+04 2 32
+2.073573e+04 4 32
+2.779764e+04 8 32
+3.326750e+04 16 32
+3.550921e+04 32 32
+3.542979e+04 64 32
+3.536735e+04 128 32
+3.605303e+04 256 32
+3.612470e+04 512 32
+3.604579e+04 1024 32
+3.604477e+04 2048 32
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+8.499895e+01 1 256
+8.500354e+01 2 256
+8.502793e+01 4 256
+### CPU: scaling test 32
+8.566387e+01 1 32
+8.564579e+01 2 32
+8.546968e+01 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+3.082111e+02 1 256
+3.057097e+02 2 256
+3.015791e+02 4 256
+### CPU: scaling test 32
+3.031632e+02 1 32
+3.047989e+02 2 32
+3.016953e+02 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+6.617272e+02 1 256
+6.661900e+02 2 256
+6.680386e+02 4 256
+### CPU: scaling test 32
+6.677614e+02 1 32
+6.719546e+02 2 32
+6.659846e+02 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+7.611249e+02 1 256
+7.606905e+02 2 256
+7.604096e+02 4 256
+### CPU: scaling test 32
+7.550844e+02 1 32
+7.531491e+02 2 32
+7.562334e+02 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+6.623690e+02 1 256
+6.648693e+02 2 256
+6.677195e+02 4 256
+### CPU: scaling test 32
+6.549910e+02 1 32
+6.592485e+02 2 32
+6.593529e+02 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
index e477be7c61..8d906ea4bc 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
 
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,6 +10,7 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make: Nothing to be done for 'all'.
@@ -21,215 +25,189 @@ make: Nothing to be done for 'all'.
 make: Nothing to be done for 'all'.
-DATE: 2024-10-06_09:13:00 +DATE: 2025-10-11_15:36:41 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.207250e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.207995e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.208247e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.762040 sec -INFO: No Floating Point Exceptions have been reported - 5,937,636,063 cycles # 2.916 GHz - 12,374,083,331 instructions # 2.08 insn per cycle - 2.091996677 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 3.066576e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.085305e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.089254e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824900e-06 ) GeV^-6 +TOTAL : 0.755600 sec + 2,946,115,284 cycles # 2.846 GHz + 5,005,757,693 instructions # 1.70 insn per cycle + 1.092047091 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.149439e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.150073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.150179e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.576872e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.578746e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.578931e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 2.066345 sec -INFO: No Floating Point Exceptions have been reported - 6,803,203,568 cycles # 2.918 GHz - 14,656,096,283 instructions # 2.15 insn per cycle - 2.390130877 seconds time elapsed +TOTAL : 1.197902 sec + 4,252,156,323 cycles # 2.858 GHz + 7,968,205,533 instructions # 1.87 insn per cycle + 1.544878632 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849635e-03 -Avg ME (F77/GPU) = 9.8712451931260159E-003 -Relative difference = 0.0021940095370046923 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849633e-03 +Avg ME (F77/GPU) = 9.8712433304319249E-003 +Relative difference = 0.0021940239227111213 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= 
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.548424e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.548685e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.548685e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.452149e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.452401e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.452401e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.179003 sec -INFO: No Floating Point Exceptions have been reported - 18,168,840,210 cycles # 2.939 GHz - 53,911,011,794 instructions # 2.97 insn per cycle - 6.183081263 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.250789 sec + 18,004,786,092 cycles # 2.879 GHz + 53,363,354,008 instructions # 2.96 insn per cycle + 6.254568811 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087551509E-003 -Relative difference = 2.119780432912131e-08 +Avg ME (F77/C++) = 9.8479612087517612E-003 +Relative difference = 2.1197460131000295e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.395658e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.396067e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.396067e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.083892e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.084249e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.084249e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.556967 sec -INFO: No Floating Point Exceptions have been reported - 4,597,936,627 cycles # 2.947 GHz - 13,808,300,252 instructions # 3.00 insn per cycle - 1.560798930 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:97016) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.714898 sec + 4,637,516,396 cycles # 2.699 GHz + 13,808,277,295 instructions # 2.98 insn per cycle + 1.718840547 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96992) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896367235E-003 Relative difference = 3.1515505172940424e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.833708e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.835461e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.835461e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.679481e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.681146e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.681146e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.774770 sec -INFO: No Floating Point Exceptions have been reported - 2,127,367,774 cycles # 2.734 GHz - 4,836,875,487 instructions # 2.27 insn per cycle - 0.778636721 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85494) (512y: 0) (512z: 0) +TOTAL : 0.793237 sec + 2,148,565,219 cycles # 2.697 GHz + 4,837,105,097 instructions # 2.25 insn per cycle + 0.797286288 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85530) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161091246E-003 Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.729108e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.731291e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.731291e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.502213e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.504225e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.504225e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.685221 sec -INFO: No Floating Point Exceptions have been reported - 1,884,703,570 cycles # 2.737 GHz - 4,291,263,737 instructions # 2.28 insn per cycle - 0.689203509 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81183) (512y: 45) (512z: 0) +TOTAL : 0.706205 sec + 1,896,245,897 cycles # 2.672 GHz + 4,291,845,754 instructions # 2.26 insn per cycle + 0.710269657 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81171) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161091246E-003 Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.870048e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.872187e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.872187e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.536289e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.538258e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.538258e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.771101 sec -INFO: No Floating Point Exceptions have been reported - 1,354,646,750 cycles # 1.748 GHz - 2,162,779,823 instructions # 1.60 insn per cycle - 0.775438585 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3481) (512y: 45) (512z:79330) +TOTAL : 0.810162 sec + 1,363,414,955 cycles # 1.676 GHz + 2,159,791,218 instructions # 1.58 insn per cycle + 0.814367082 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3501) (512y: 15) (512z:79315) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982676284E-003 Relative difference = 2.004124217057488e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..b311421434 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_16:05:58 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +3.033893e+04 1 256 +3.187494e+04 2 256 +3.481987e+04 4 256 +3.512251e+04 8 256 +3.538857e+04 16 256 +3.542822e+04 32 256 +3.543221e+04 64 256 +3.537512e+04 128 256 +3.502452e+04 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +7.725986e+03 1 32 +1.328194e+04 2 32 +1.942036e+04 4 32 +2.633854e+04 8 32 +3.294887e+04 16 32 +3.493545e+04 32 32 +3.529299e+04 64 32 +3.546637e+04 128 32 +3.548686e+04 256 32 +3.523534e+04 512 32 +3.522952e+04 1024 32 +3.514012e+04 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.495344e+01 1 256 +8.539448e+01 2 256 +8.496927e+01 4 256 +### CPU: scaling test 32 +8.470460e+01 1 32 +8.470926e+01 2 32 +8.506051e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.029024e+02 1 256 +3.058068e+02 2 256 +3.092272e+02 4 256 +### CPU: scaling test 32 +3.088673e+02 1 32 +3.061911e+02 2 32 +3.071123e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.653819e+02 1 256 +6.661146e+02 2 256 +6.676979e+02 4 256 +### CPU: scaling test 32 +6.681941e+02 1 32 +6.675336e+02 2 32 +6.688978e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.615474e+02 1 256 +7.624411e+02 2 256 +7.580407e+02 4 256 +### CPU: scaling test 32 +7.724123e+02 1 32 +7.622893e+02 2 32 +7.629688e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.726799e+02 1 256 +6.675111e+02 2 256 +6.619522e+02 4 256 +### CPU: scaling test 32 +6.616673e+02 1 32 +6.588386e+02 2 32 +6.622712e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 09d523a948..66637c5d79 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,239 +25,197 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:42:06 +DATE: 2025-10-11_16:34:27 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.291704e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.296560e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.296560e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187093e-05 +- 9.825663e-06 ) GeV^-6 -TOTAL : 1.680127 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,675,085,647 cycles # 2.923 GHz - 11,509,492,893 instructions # 2.03 insn per cycle - 1.997903242 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +EvtsPerSec[Rmb+ME] (23) = ( 2.846569e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.930073e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.930073e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 +TOTAL : 0.744004 sec + 2,812,928,508 cycles # 2.768 GHz + 4,058,280,243 instructions # 1.44 insn per cycle + 1.074142514 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.120892e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.132073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.132073e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856440e-04 +- 8.331091e-05 ) GeV^-6 -TOTAL : 2.037220 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,712,310,342 cycles # 2.924 GHz - 13,777,135,261 instructions # 2.05 insn per cycle - 2.354099539 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.542471e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.575116e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.575116e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856440e-04 +- 8.331090e-05 ) GeV^-6 +TOTAL : 1.186896 sec + 4,180,690,234 cycles # 2.849 GHz + 8,037,777,996 instructions # 1.92 insn per cycle + 1.534789099 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849635e-03 -Avg ME (F77/GPU) = 9.8712451931260159E-003 -Relative difference = 0.0021940095370046923 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849633e-03 +Avg ME (F77/GPU) = 9.8712433304319249E-003 +Relative difference = 0.0021940239227111213 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.574125e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.574397e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.574397e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.504304e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.504560e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.504560e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.159980 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 18,121,008,944 cycles # 2.940 GHz - 53,916,989,652 instructions # 2.98 insn per cycle - 6.164330765 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.212057 sec + 17,925,660,588 cycles # 2.884 GHz + 53,364,413,300 instructions # 2.98 insn per cycle + 6.216192253 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087551509E-003 -Relative difference = 2.119780432912131e-08 +Avg ME (F77/C++) = 9.8479612087517612E-003 +Relative difference = 2.1197460131000295e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.371688e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.372089e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.372089e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.026780e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.027128e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.027128e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.568419 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,624,959,734 cycles # 2.942 GHz - 13,809,578,618 instructions # 2.99 insn per cycle - 1.572870258 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:97016) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.746031 sec + 4,640,321,340 cycles # 2.653 GHz + 13,810,267,539 instructions # 2.98 insn per cycle + 1.750270483 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96992) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896367235E-003 Relative difference = 3.1515505172940424e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.853120e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.854860e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.854860e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.541416e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.543021e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.543021e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.772760 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,127,660,282 cycles # 2.740 GHz - 4,839,303,130 instructions # 2.27 insn per cycle - 0.777110537 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85494) (512y: 0) (512z: 0) +TOTAL : 0.809578 sec + 2,161,931,873 cycles # 2.659 GHz + 4,839,517,439 instructions # 2.24 insn per cycle + 0.813642934 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85530) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161091246E-003 Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.707103e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.709607e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.709607e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.420966e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.422988e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.422988e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.687680 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,889,891,398 cycles # 2.733 GHz - 4,293,271,631 instructions # 2.27 insn per cycle - 0.692031150 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81183) (512y: 45) (512z: 0) +TOTAL : 0.714158 sec + 1,911,038,749 cycles # 2.664 GHz + 4,293,943,131 instructions # 2.25 insn per cycle + 0.718267339 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81171) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161091246E-003 Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.738421e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.740575e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.740575e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.647126e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.649133e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.649133e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.785848 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,358,106,687 cycles # 1.720 GHz - 2,165,384,980 instructions # 1.59 insn per cycle - 0.790493646 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3481) (512y: 45) (512z:79330) +TOTAL : 0.797274 sec + 1,365,650,123 cycles # 1.706 GHz + 2,161,762,081 instructions # 1.58 insn per cycle + 0.801641364 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3501) (512y: 15) (512z:79315) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982676284E-003 Relative difference = 2.004124217057488e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 33a64296d4..a85d1bcb39 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,189 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:14:03 +DATE: 2025-10-11_15:38:06 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.196404e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.197145e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.197475e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.762965 sec -INFO: No Floating Point Exceptions have been reported - 5,951,937,078 cycles # 2.924 GHz - 11,910,577,864 instructions # 2.00 insn per cycle - 2.092003198 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 3.071043e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.090506e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.094612e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824900e-06 ) GeV^-6 +TOTAL : 0.757789 sec + 2,958,910,358 cycles # 2.847 GHz + 4,794,775,632 instructions # 1.62 insn per cycle + 1.096595085 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.150073e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.150749e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.150840e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.567606e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.569510e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.569696e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 2.074025 sec -INFO: No Floating Point Exceptions have been reported - 6,857,187,374 cycles # 2.930 GHz - 14,190,515,168 instructions # 2.07 insn per cycle - 2.396988151 seconds time elapsed +TOTAL : 1.206702 sec + 4,225,242,901 cycles # 2.841 GHz + 8,156,770,765 instructions # 1.93 insn per cycle + 1.554101217 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849635e-03 -Avg ME (F77/GPU) = 9.8712451931260107E-003 -Relative difference = 0.0021940095370041636 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849633e-03 +Avg ME (F77/GPU) = 9.8712433304319249E-003 +Relative difference = 0.0021940239227111213 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= 
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.597266e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.597536e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.597536e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.507145e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.507418e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.507418e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.144692 sec -INFO: No Floating Point Exceptions have been reported - 18,086,727,911 cycles # 2.942 GHz - 53,895,836,183 instructions # 2.98 insn per cycle - 6.148512893 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.208388 sec + 17,992,278,108 cycles # 2.897 GHz + 53,336,143,963 instructions # 2.96 insn per cycle + 6.212278042 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087572898E-003 -Relative difference = 2.1198021522715588e-08 +Avg ME (F77/C++) = 9.8479612087558014E-003 +Relative difference = 2.119787038556726e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.388656e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.389069e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.389069e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.069142e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.069523e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.069523e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.560721 sec -INFO: No Floating Point Exceptions have been reported - 4,571,260,015 cycles # 2.924 GHz - 13,800,942,063 instructions # 3.02 insn per cycle - 1.564719207 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96651) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.722052 sec + 4,637,939,725 cycles # 2.688 GHz + 13,805,971,610 instructions # 2.98 insn per cycle + 1.726097842 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896065809E-003 Relative difference = 3.151856596628469e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.702410e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.704003e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.704003e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.610751e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.612520e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.612520e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.789887 sec -INFO: No Floating Point Exceptions have been reported - 2,151,012,254 cycles # 2.712 GHz - 4,840,938,021 instructions # 2.25 insn per cycle - 0.793816354 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85884) (512y: 0) (512z: 0) +TOTAL : 0.800943 sec + 2,170,709,754 cycles # 2.698 GHz + 4,844,490,730 instructions # 2.23 insn per cycle + 0.805141444 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85852) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161091923E-003 Relative difference = 1.85880227405429e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.657646e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.659745e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.659745e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.606901e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.608951e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.608951e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.691425 sec -INFO: No Floating Point Exceptions have been reported - 1,894,431,690 cycles # 2.727 GHz - 4,294,884,277 instructions # 2.27 insn per cycle - 0.695223368 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81725) (512y: 25) (512z: 0) +TOTAL : 0.696038 sec + 1,884,685,200 cycles # 2.695 GHz + 4,299,634,626 instructions # 2.28 insn per cycle + 0.700035846 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81642) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161091923E-003 Relative difference = 1.85880227405429e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.673392e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.675470e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.675470e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.489547e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.491608e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.491608e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.793743 sec -INFO: No Floating Point Exceptions have been reported - 1,366,656,580 cycles # 1.715 GHz - 2,169,713,805 instructions # 1.59 insn per cycle - 0.797745119 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4092) (512y: 32) (512z:79551) +TOTAL : 0.816037 sec + 1,366,505,808 cycles # 1.668 GHz + 2,169,050,969 instructions # 1.59 insn per cycle + 0.820326650 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4103) (512y: 24) (512z:79552) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982957326E-003 Relative difference = 2.0044082998332894e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..53bb1cfda7 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_15:47:09 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.616958e+04 1 256 +1.637015e+04 2 256 +1.727451e+04 4 256 +1.703878e+04 8 256 +1.713757e+04 16 256 +1.692549e+04 32 256 +1.662520e+04 64 256 +1.655737e+04 128 256 +1.660158e+04 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +6.521951e+03 1 32 +1.124531e+04 2 32 +1.474858e+04 4 32 +1.618404e+04 8 32 +1.651807e+04 16 32 +1.695250e+04 32 32 +1.681150e+04 64 32 +1.629231e+04 128 32 +1.600637e+04 256 32 +1.595680e+04 512 32 +1.609152e+04 1024 32 +1.606225e+04 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
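Each "### GPU: scaling test" or "### CPU: scaling test" block above prints one line per configuration in the form "<EvtsPerSec> <nblocks> <nthreads>", so throughput can be read off against the number of events in flight (nblocks * nthreads). Below is a minimal sketch of a reader for that three-column format; it is a hypothetical helper for inspecting these logs, not part of the repository, and it simply skips the header and assertion lines that do not match the format.

```cpp
// Illustrative only: parse "<EvtsPerSec> <nblocks> <nthreads>" lines from a .scaling log
// (piped to stdin) and print throughput versus the number of events in flight.
#include <iostream>
#include <sstream>
#include <string>

int main()
{
  std::string line;
  while( std::getline( std::cin, line ) )
  {
    std::istringstream iss( line );
    double evtsPerSec;
    int nblocks, nthreads;
    if( iss >> evtsPerSec >> nblocks >> nthreads ) // non-matching lines are ignored
      std::cout << nblocks * nthreads << " events in flight -> "
                << evtsPerSec << " events/s" << std::endl;
  }
  return 0;
}
```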
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.530837e+01 1 256 +7.486415e+01 2 256 +7.494008e+01 4 256 +### CPU: scaling test 32 +7.525282e+01 1 32 +7.477017e+01 2 32 +7.524610e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.548840e+02 1 256 +1.522353e+02 2 256 +1.543201e+02 4 256 +### CPU: scaling test 32 +1.576268e+02 1 32 +1.582873e+02 2 32 +1.506909e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.557154e+02 1 256 +3.547270e+02 2 256 +3.557554e+02 4 256 +### CPU: scaling test 32 +3.614135e+02 1 32 +3.600100e+02 2 32 +3.596141e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.001766e+02 1 256 +4.125953e+02 2 256 +4.090213e+02 4 256 +### CPU: scaling test 32 +4.084924e+02 1 32 +4.056804e+02 2 32 +4.080579e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.519966e+02 1 256 +3.510473e+02 2 256 +3.460383e+02 4 256 +### CPU: scaling test 32 +3.459963e+02 1 32 +3.417875e+02 2 32 +3.469620e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index a2a6307c02..686f1c46c7 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,189 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:10:06 +DATE: 2025-10-11_15:33:09 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.665934e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.666477e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.666666e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.606719e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.613205e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.614399e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.202831 sec -INFO: No Floating Point Exceptions have been reported - 7,373,914,452 cycles # 2.913 GHz - 16,351,055,335 instructions # 2.22 insn per cycle - 2.588547453 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.810711 sec + 3,229,171,179 cycles # 2.859 GHz + 5,715,641,917 instructions # 1.77 insn per cycle + 1.191471752 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.110897e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.111188e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.111222e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.654245e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.655018e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.655075e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.442430 sec -INFO: No Floating Point Exceptions have been reported - 11,070,694,428 cycles # 2.924 GHz - 25,628,142,124 instructions # 2.31 insn per cycle - 3.841933628 seconds time elapsed +TOTAL : 1.784420 sec + 6,293,809,246 cycles # 2.879 GHz + 12,593,045,017 instructions # 2.00 insn per cycle + 2.242570146 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/GPU) = 9.8722595419029543E-003 +Relative difference = 3.502841288596502e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point 
Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.567548e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.567783e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.567783e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.469254e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.469466e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.469466e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.978728 sec -INFO: No Floating Point Exceptions have been reported - 19,201,924,470 cycles # 2.751 GHz - 54,137,446,015 instructions # 2.82 insn per cycle - 6.982563293 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32000) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.071086 sec + 19,047,832,122 cycles # 2.693 GHz + 53,831,188,921 instructions # 2.83 insn per cycle + 7.075248115 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.526848e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.526939e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.526939e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.520487e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.520570e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.520570e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.460419 sec -INFO: No Floating Point Exceptions have been reported - 9,442,620,757 cycles # 2.727 GHz - 26,188,001,033 instructions # 2.77 insn per cycle - 3.464377416 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96049) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.474834 sec + 9,355,185,296 cycles # 2.691 GHz + 25,920,357,243 instructions # 2.77 insn per cycle + 3.478986906 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96092) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.548969e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.549418e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.549418e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.467313e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.467816e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.467816e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.489614 sec -INFO: No Floating Point Exceptions have been reported - 4,075,741,004 cycles # 2.731 GHz - 9,249,825,182 instructions # 2.27 insn per cycle - 1.493453651 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) +TOTAL : 1.523962 sec + 3,999,825,927 cycles # 2.619 GHz + 9,105,365,579 instructions # 2.28 insn per cycle + 1.528167166 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.098256e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.098850e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.098850e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.083261e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.083882e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.083882e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.290484 sec -INFO: No Floating Point Exceptions have been reported - 3,523,951,603 cycles # 2.724 GHz - 8,183,239,467 instructions # 2.32 insn per cycle - 1.294382992 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80015) (512y: 80) (512z: 0) +TOTAL : 1.295937 sec + 3,509,301,061 cycles # 2.701 GHz + 8,040,567,810 instructions # 2.29 insn per cycle + 1.299964950 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79768) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.495372e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.495944e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.495944e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.452173e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.452727e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.452727e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.513924 sec -INFO: No Floating Point Exceptions have been reported - 2,658,314,764 cycles # 1.752 GHz - 4,173,156,780 instructions # 1.57 insn per cycle - 1.517996809 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 92) (512z:78910) +TOTAL : 1.532017 sec + 2,596,809,497 cycles # 1.691 GHz + 4,060,850,927 instructions # 1.56 insn per cycle + 1.536186135 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2509) (512y: 61) (512z:78957) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..a739246eca --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_16:03:38 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.525607e+04 1 256 +1.592603e+04 2 256 +1.694297e+04 4 256 +1.694752e+04 8 256 +1.680152e+04 16 256 +1.667228e+04 32 256 +1.648853e+04 64 256 +1.642335e+04 128 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +5.344354e+03 1 32 +9.059524e+03 2 32 +1.316587e+04 4 32 +1.535902e+04 8 32 +1.599627e+04 16 32 +1.690040e+04 32 32 +1.613824e+04 64 32 +1.606066e+04 128 32 +1.607094e+04 256 32 +1.586333e+04 512 32 +1.570749e+04 1024 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.451618e+01 1 256 +7.447961e+01 2 256 +7.464296e+01 4 256 +### CPU: scaling test 32 +7.454429e+01 1 32 +7.454562e+01 2 32 +7.491906e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.523430e+02 1 256 +1.528849e+02 2 256 +1.545423e+02 4 256 +### CPU: scaling test 32 +1.508465e+02 1 32 +1.522871e+02 2 32 +1.514789e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.569891e+02 1 256 +3.579373e+02 2 256 +3.580811e+02 4 256 +### CPU: scaling test 32 +3.582840e+02 1 32 +3.591263e+02 2 32 +3.590191e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.091335e+02 1 256 +4.101923e+02 2 256 +4.047677e+02 4 256 +### CPU: scaling test 32 +4.052367e+02 1 32 +4.049500e+02 2 32 +4.058871e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.457958e+02 1 256 +3.518110e+02 2 256 +3.523691e+02 4 256 +### CPU: scaling test 32 +3.457462e+02 1 32 +3.517526e+02 2 32 +3.507713e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 67fff86657..2c63694669 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,189 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:11:33 +DATE: 2025-10-11_15:34:55 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.667678e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.668217e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.668387e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.591312e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.597916e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.599015e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.202686 sec -INFO: No Floating Point Exceptions have been reported - 7,336,606,843 cycles # 2.899 GHz - 15,241,236,080 instructions # 2.08 insn per cycle - 2.586897924 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.809629 sec + 3,237,669,928 cycles # 2.864 GHz + 5,681,011,752 instructions # 1.75 insn per cycle + 1.192308721 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.107552e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107855e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107889e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.667525e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.668322e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.668373e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.440073 sec -INFO: No Floating Point Exceptions have been reported - 11,052,276,434 cycles # 2.923 GHz - 25,411,180,343 instructions # 2.30 insn per cycle - 3.836365671 seconds time elapsed +TOTAL : 1.762250 sec + 6,151,588,956 cycles # 2.862 GHz + 12,789,871,898 instructions # 2.08 insn per cycle + 2.206834958 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/GPU) = 9.8722595419029543E-003 +Relative difference = 3.502841288596502e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point 
Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.653903e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.654105e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.654105e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.441824e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.442030e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.442030e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.889827 sec -INFO: No Floating Point Exceptions have been reported - 19,201,166,017 cycles # 2.786 GHz - 54,161,677,415 instructions # 2.82 insn per cycle - 6.893652512 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32202) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.097119 sec + 19,021,241,015 cycles # 2.679 GHz + 53,824,218,201 instructions # 2.83 insn per cycle + 7.101056562 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32012) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.552412e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.552503e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.552503e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.520581e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.520672e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.520672e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.403221 sec -INFO: No Floating Point Exceptions have been reported - 9,295,420,050 cycles # 2.729 GHz - 26,089,296,035 instructions # 2.81 insn per cycle - 3.407123949 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:95935) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.473548 sec + 9,360,233,363 cycles # 2.692 GHz + 25,827,022,283 instructions # 2.76 insn per cycle + 3.477681834 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:95883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.556434e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.556900e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.556900e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.499910e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.500338e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.500338e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.488620 sec -INFO: No Floating Point Exceptions have been reported - 4,059,104,235 cycles # 2.721 GHz - 9,213,839,753 instructions # 2.27 insn per cycle - 1.492560916 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83864) (512y: 0) (512z: 0) +TOTAL : 1.510429 sec + 4,054,458,858 cycles # 2.678 GHz + 9,070,411,764 instructions # 2.24 insn per cycle + 1.514545882 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83452) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.125241e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.125840e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.125840e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.057773e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.058358e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.058358e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.282211 sec -INFO: No Floating Point Exceptions have been reported - 3,511,408,538 cycles # 2.732 GHz - 8,168,208,932 instructions # 2.33 insn per cycle - 1.286095846 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79421) (512y: 230) (512z: 0) +TOTAL : 1.302962 sec + 3,492,520,706 cycles # 2.673 GHz + 8,024,600,361 instructions # 2.30 insn per cycle + 1.307117868 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79136) (512y: 215) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.517573e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518129e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.518129e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.494027e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.494558e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.494558e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.503444 sec -INFO: No Floating Point Exceptions have been reported - 2,622,176,822 cycles # 1.740 GHz - 4,167,750,292 instructions # 1.59 insn per cycle - 1.507552292 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1879) (512y: 174) (512z:78884) +TOTAL : 1.513587 sec + 2,591,602,459 cycles # 1.708 GHz + 4,056,631,617 instructions # 1.57 insn per cycle + 1.517867253 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1776) (512y: 165) (512z:78888) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..f1df17a77c --- /dev/null +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:44:03 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.428635e+06 1 256 +2.986921e+06 2 256 +5.564976e+06 4 256 +1.150400e+07 8 256 +2.254241e+07 16 256 +3.299328e+07 32 256 +3.991678e+07 64 256 +4.342243e+07 128 256 +4.801742e+07 256 256 +5.029240e+07 512 256 +5.134165e+07 1024 256 +### GPU: scaling test 32 +1.949995e+05 1 32 +3.776925e+05 2 32 +7.282783e+05 4 32 +1.483318e+06 8 32 +2.934652e+06 16 32 +4.620001e+06 32 32 +1.110479e+07 64 32 +2.248141e+07 128 32 +3.497298e+07 256 32 +3.843258e+07 512 32 +4.371853e+07 1024 32 +4.702509e+07 2048 32 +4.914143e+07 4096 32 +5.007560e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.018202e+05 1 256 +1.029861e+05 2 256 +1.049904e+05 4 256 +### CPU: scaling test 32 +9.750093e+04 1 32 +9.993083e+04 2 32 +1.029180e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.770505e+05 1 256 +1.765797e+05 2 256 +1.854054e+05 4 256 +### CPU: scaling test 32 +1.484850e+05 1 32 +1.713608e+05 2 32 +1.595040e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.857545e+05 1 256 +3.168191e+05 2 256 +3.177122e+05 4 256 +### CPU: scaling test 32 +2.953038e+05 1 32 +3.077116e+05 2 32 +2.876185e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.080307e+05 1 256 +3.180421e+05 2 256 +3.341884e+05 4 256 +### CPU: scaling test 32 +2.868052e+05 1 32 +3.156394e+05 2 32 +3.097819e+05 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.313974e+05 1 256 +2.307900e+05 2 256 +2.293449e+05 4 256 +### CPU: scaling test 32 +2.313560e+05 1 32 +2.290500e+05 2 32 +2.289947e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 468f6865a8..d112a11495 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:05:51 +DATE: 2025-10-11_15:27:25 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.906944e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.902591e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.013821e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.313564e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.022320e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.232850e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.458221 sec -INFO: No Floating Point Exceptions have been reported - 1,930,997,109 cycles # 2.858 GHz - 2,724,198,211 instructions # 1.41 insn per cycle - 0.805328419 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.462516 sec + 1,997,687,796 cycles # 2.814 GHz + 2,748,418,377 instructions # 1.38 insn per cycle + 0.769002804 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.002453e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.463176e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.675243e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.849800e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.989232e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.162437e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.544639 sec -INFO: No Floating Point Exceptions have been reported - 2,250,691,324 cycles # 2.871 GHz - 3,190,813,390 instructions # 1.42 insn per cycle - 0.843484638 seconds time elapsed +TOTAL : 0.537675 sec + 2,303,047,279 cycles # 2.838 GHz + 3,173,611,128 instructions # 1.38 insn per cycle + 0.868680787 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 +Avg ME (F77/GPU) = 0.14247482467490463 +Relative difference = 5.286902840821208e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.052668e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.075406e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.075406e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039909e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.062156e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.062156e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.578445 sec -INFO: No Floating Point Exceptions have been reported - 4,629,037,835 cycles # 2.928 GHz - 13,193,545,970 instructions # 2.85 insn per cycle - 1.584589009 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.595860 sec + 4,617,130,408 cycles # 2.888 GHz + 13,249,342,927 instructions # 2.87 insn per cycle + 1.599801948 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will 
cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.869817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.940106e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.940106e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.827783e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.896147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.896147e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.895982 sec -INFO: No Floating Point Exceptions have been reported - 2,636,174,950 cycles # 2.931 GHz - 7,556,706,256 instructions # 2.87 insn per cycle - 0.901753059 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.915570 sec + 2,669,358,674 cycles # 2.905 GHz + 7,600,949,147 instructions # 2.85 insn per cycle + 0.919765484 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.170738e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.377041e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.377041e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.046861e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.237725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.237725e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.538337 sec -INFO: No Floating Point Exceptions have been reported - 1,492,365,440 cycles # 2.760 GHz - 3,161,633,609 instructions # 2.12 insn per cycle - 0.543901971 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2991) (512y: 0) (512z: 0) +TOTAL : 0.557374 sec + 1,530,133,486 cycles # 2.729 GHz + 3,193,359,124 instructions # 2.09 insn per cycle + 0.561538714 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3021) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.502118e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.753079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.753079e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.222833e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.436298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.436298e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.488254 sec -INFO: No Floating Point Exceptions have been reported - 1,345,193,436 cycles # 2.734 GHz - 3,015,805,712 instructions # 2.24 insn per cycle - 0.494320620 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 104) (512z: 0) +TOTAL : 0.527914 sec + 1,448,845,809 cycles # 2.727 GHz + 3,068,216,889 instructions # 2.12 insn per cycle + 0.532005288 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2827) (512y: 84) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.340176e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.450488e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.450488e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.262309e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.366937e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.366937e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.722488 sec -INFO: No Floating Point Exceptions have been reported - 1,326,137,037 cycles # 1.826 GHz - 1,964,340,659 instructions # 1.48 insn per cycle - 0.728328312 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1379) (512y: 106) (512z: 2218) +TOTAL : 0.746275 sec + 1,345,907,467 cycles # 1.795 GHz + 1,981,512,387 instructions # 1.47 insn per cycle + 0.750498916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1439) (512y: 84) (512z: 2209) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index a32e85fd77..542ec194e9 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,231 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:39:10 +DATE: 2025-10-11_16:30:42 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.313371e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.590831e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.590831e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.356662e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.903029e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.903029e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.487212 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,046,207,140 cycles # 2.880 GHz - 3,015,907,255 instructions # 1.47 insn per cycle - 0.769534809 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.490080 sec + 2,074,202,921 cycles # 2.819 GHz + 2,982,362,559 instructions # 1.44 insn per cycle + 0.792779275 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.228660e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.270938e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.270938e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.203461e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.181328e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.181328e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.758730 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,917,079,859 cycles # 2.883 GHz - 4,489,082,127 instructions # 1.54 insn per cycle - 1.069078440 seconds time elapsed +TOTAL : 0.757533 sec + 2,979,284,817 cycles # 2.853 GHz + 4,399,436,734 instructions # 1.48 insn per cycle + 1.101470538 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 +Avg ME (F77/GPU) = 0.14247482467490463 +Relative difference = 5.286902840821208e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE 
program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.058535e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.081557e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.081557e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.040166e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.062990e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.062990e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.574537 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,656,483,821 cycles # 2.950 GHz - 13,198,201,576 instructions # 2.83 insn per cycle - 1.579077435 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.601584 sec + 4,649,519,147 cycles # 2.897 GHz + 13,253,744,210 instructions # 2.85 insn per cycle + 1.606011259 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.861172e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.931943e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.931943e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.815648e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.884893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.884893e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.907508 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,678,662,656 cycles # 2.939 GHz - 7,605,263,564 instructions # 2.84 insn per cycle - 0.912202227 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.929220 sec + 2,705,069,112 cycles # 2.900 GHz + 7,649,258,945 instructions # 2.83 insn per cycle + 0.933656370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.153263e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.357026e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.357026e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.970773e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.160922e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.160922e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.547067 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,524,781,245 cycles # 2.767 GHz - 3,210,388,287 instructions # 2.11 insn per cycle - 0.551691801 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2991) (512y: 0) (512z: 0) +TOTAL : 0.579438 sec + 1,570,726,943 cycles # 2.694 GHz + 3,243,232,441 instructions # 2.06 insn per cycle + 0.583677287 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3021) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.508777e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.767060e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.767060e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.172484e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.386570e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.386570e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.494747 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,383,177,469 cycles # 2.773 GHz - 3,064,481,068 instructions # 2.22 insn per cycle - 0.499446571 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 104) (512z: 0) +TOTAL : 0.544496 sec + 1,490,247,847 cycles # 2.718 GHz + 3,118,276,131 instructions # 2.09 insn per cycle + 0.548976134 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2827) (512y: 84) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.351157e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.462501e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.462501e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.208001e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.313270e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.313270e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.725065 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,357,891,290 cycles # 1.863 GHz - 2,000,455,329 instructions # 1.47 insn per cycle - 0.729577819 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1379) (512y: 106) (512z: 2218) +TOTAL : 0.771513 sec + 1,385,006,024 cycles # 1.787 GHz + 2,018,418,785 instructions # 1.46 insn per cycle + 0.775891856 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1439) (512y: 84) (512z: 2209) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 67eac99bab..c96c0f2bba 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:04 +DATE: 2025-10-11_15:27:47 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.866343e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.840904e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.947003e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.222648e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.903995e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.118782e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.463809 sec -INFO: No Floating Point Exceptions have been reported - 1,942,418,108 cycles # 2.861 GHz - 2,721,411,859 instructions # 1.40 insn per cycle - 0.812650633 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.464819 sec + 2,030,821,916 cycles # 2.839 GHz + 2,744,793,219 instructions # 1.35 insn per cycle + 0.772863650 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.997280e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.399599e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.603946e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.790256e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.896792e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.070548e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.538885 sec -INFO: No Floating Point Exceptions have been reported - 2,239,160,610 cycles # 2.873 GHz - 3,203,384,758 instructions # 1.43 insn per cycle - 0.836856412 seconds time elapsed +TOTAL : 0.539655 sec + 2,316,213,602 cycles # 2.850 GHz + 3,194,995,847 instructions # 1.38 insn per cycle + 0.870686173 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 +Avg ME (F77/GPU) = 0.14247482467490463 +Relative difference = 5.286902840821208e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.060643e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.083213e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.083213e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.036091e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.058176e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058176e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.565121 sec -INFO: No Floating Point Exceptions have been reported - 4,623,795,988 cycles # 2.948 GHz - 13,181,888,102 instructions # 2.85 insn per cycle - 1.571833324 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 692) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.601117 sec + 4,614,781,714 cycles # 2.877 GHz + 13,227,683,016 instructions # 2.87 insn per cycle + 1.605070443 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 679) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will 
cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.878003e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.949625e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.949625e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.832083e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.900484e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.900484e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.892385 sec -INFO: No Floating Point Exceptions have been reported - 2,641,116,720 cycles # 2.947 GHz - 7,555,506,374 instructions # 2.86 insn per cycle - 0.899472366 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3093) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.913405 sec + 2,666,905,925 cycles # 2.909 GHz + 7,595,681,340 instructions # 2.85 insn per cycle + 0.917462386 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3077) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.178148e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.383095e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.383095e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.997059e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.186796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186796e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.535311 sec -INFO: No Floating Point Exceptions have been reported - 1,491,222,481 cycles # 2.767 GHz - 3,161,019,864 instructions # 2.12 insn per cycle - 0.541387025 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2976) (512y: 0) (512z: 0) +TOTAL : 0.566232 sec + 1,532,545,982 cycles # 2.690 GHz + 3,190,811,369 instructions # 2.08 insn per cycle + 0.570104783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3005) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.523592e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.778898e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.778898e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.138120e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.345703e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.345703e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.485060 sec -INFO: No Floating Point Exceptions have been reported - 1,349,314,232 cycles # 2.763 GHz - 3,012,812,614 instructions # 2.23 insn per cycle - 0.489068736 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2726) (512y: 104) (512z: 0) +TOTAL : 0.542027 sec + 1,447,882,232 cycles # 2.655 GHz + 3,062,649,899 instructions # 2.12 insn per cycle + 0.545967207 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2804) (512y: 84) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.347943e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.459729e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.459729e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.226133e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.328099e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.328099e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.720112 sec -INFO: No Floating Point Exceptions have been reported - 1,326,103,986 cycles # 1.833 GHz - 1,962,664,460 instructions # 1.48 insn per cycle - 0.726078775 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1356) (512y: 106) (512z: 2218) +TOTAL : 0.757778 sec + 1,343,211,600 cycles # 1.765 GHz + 1,978,672,810 instructions # 1.47 insn per cycle + 0.761787399 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1416) (512y: 84) (512z: 2209) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..8a82307bae --- /dev/null +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:44:45 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.527045e+06 1 256 +3.131556e+06 2 256 +6.093388e+06 4 256 +1.251780e+07 8 256 +2.244630e+07 16 256 +4.178995e+07 32 256 +6.592442e+07 64 256 +7.658956e+07 128 256 +8.216021e+07 256 256 +8.838611e+07 512 256 +9.244041e+07 1024 256 +### GPU: scaling test 32 +1.864346e+05 1 32 +3.981461e+05 2 32 +7.916041e+05 4 32 +1.446352e+06 8 32 +2.861310e+06 16 32 +6.255536e+06 32 32 +1.192410e+07 64 32 +2.215132e+07 128 32 +4.236701e+07 256 32 +6.877647e+07 512 32 +7.973525e+07 1024 32 +8.551740e+07 2048 32 +9.532558e+07 4096 32 +9.914765e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.054964e+05 1 256 +1.086764e+05 2 256 +1.085879e+05 4 256 +### CPU: scaling test 32 +9.631447e+04 1 32 +1.042281e+05 2 32 +1.016890e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.679848e+05 1 256 +2.830096e+05 2 256 +2.920388e+05 4 256 +### CPU: scaling test 32 +2.003030e+05 1 32 +2.733186e+05 2 32 +2.733314e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.015207e+05 1 256 +5.639568e+05 2 256 +5.644473e+05 4 256 +### CPU: scaling test 32 +5.530113e+05 1 32 +5.540310e+05 2 32 +6.104453e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.318601e+05 1 256 +5.672087e+05 2 256 +5.418454e+05 4 256 +### CPU: scaling test 32 +4.569666e+05 1 32 +5.422212e+05 2 32 +5.271481e+05 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.266468e+05 1 256 +4.319869e+05 2 256 +4.643166e+05 4 256 +### CPU: scaling test 32 +4.562174e+05 1 32 +4.628927e+05 2 32 +4.441638e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index fa95ebd131..3c2f832038 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:45 +DATE: 2025-10-11_15:28:49 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.818001e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.982501e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.122889e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.775185e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.659813e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.119856e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.452282 sec -INFO: No Floating Point Exceptions have been reported - 1,920,727,034 cycles # 2.860 GHz - 2,694,517,558 instructions # 1.40 insn per cycle - 0.728408510 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 169 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.460990 sec + 2,032,870,493 cycles # 2.841 GHz + 2,757,410,394 instructions # 1.36 insn per cycle + 0.774218584 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 161 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.287877e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.320334e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.683236e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.197057e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.828077e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.174418e+07 ) sec^-1 MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 -TOTAL : 0.495314 sec -INFO: No Floating Point Exceptions have been reported - 2,079,539,950 cycles # 2.850 GHz - 2,952,237,418 instructions # 1.42 insn per cycle - 0.786339466 seconds time elapsed +TOTAL : 0.492525 sec + 2,151,242,968 cycles # 2.846 GHz + 2,972,332,872 instructions # 1.38 insn per cycle + 0.812892837 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247487904286338 -Relative difference = 0.0003670698531228044 +Avg ME (F77/GPU) = 0.14247487171431850 +Relative difference = 0.0003670183967887531 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.109567e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.134660e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.134660e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.088774e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.113486e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113486e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.494759 sec -INFO: No Floating Point Exceptions have been reported - 4,403,081,916 cycles # 2.940 GHz - 12,951,948,710 instructions # 2.94 insn per cycle - 1.498420981 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 645) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.523041 sec + 4,438,181,728 cycles # 2.908 GHz + 12,997,899,281 instructions # 2.93 insn per cycle + 1.526979824 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will 
cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246861273719524 Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.886806e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.066754e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.066754e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.813324e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.986491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.986491e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.584675 sec -INFO: No Floating Point Exceptions have been reported - 1,726,276,919 cycles # 2.937 GHz - 4,542,407,737 instructions # 2.63 insn per cycle - 0.588476135 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3627) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.599748 sec + 1,741,244,369 cycles # 2.889 GHz + 4,565,155,972 instructions # 2.62 insn per cycle + 0.603721432 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3608) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246862329122401 Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.651382e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.346145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.346145e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.470584e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.128186e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.128186e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.307816 sec -INFO: No Floating Point Exceptions have been reported - 856,647,676 cycles # 2.754 GHz - 1,917,830,464 instructions # 2.24 insn per cycle - 0.311794908 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3580) (512y: 0) (512z: 0) +TOTAL : 0.317328 sec + 874,197,910 cycles # 2.725 GHz + 1,937,671,895 instructions # 2.22 insn per cycle + 0.321309948 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.083995e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.890169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.890169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.732936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.453145e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.453145e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287118 sec -INFO: No Floating Point Exceptions have been reported - 801,284,784 cycles # 2.760 GHz - 1,834,043,941 instructions # 2.29 insn per cycle - 0.290894624 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3400) (512y: 22) (512z: 0) +TOTAL : 0.303630 sec + 837,570,844 cycles # 2.728 GHz + 1,865,428,267 instructions # 2.23 insn per cycle + 0.307759201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3485) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.500723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.948038e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.948038e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.363450e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.779212e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.779212e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.384030 sec -INFO: No Floating Point Exceptions have been reported - 726,928,592 cycles # 1.877 GHz - 1,308,660,654 instructions # 1.80 insn per cycle - 0.387900268 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1964) (512y: 24) (512z: 2435) +TOTAL : 0.396164 sec + 743,365,153 cycles # 1.861 GHz + 1,320,595,546 instructions # 1.78 insn per cycle + 0.400174159 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 2) (512z: 2428) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491576758442 Relative difference = 1.1066920862943416e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 5a6a874489..3158a41f16 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,231 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:39:24 +DATE: 2025-10-11_16:31:01 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.958276e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.362856e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.362856e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.164266e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.164377e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.164377e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429183e+01 ) GeV^-2 -TOTAL : 0.467586 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,958,269,299 cycles # 2.868 GHz - 2,873,921,299 instructions # 1.47 insn per cycle - 0.741370031 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.466915 sec + 2,002,533,494 cycles # 2.818 GHz + 2,846,516,929 instructions # 1.42 insn per cycle + 0.767921314 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 161 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 169 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.867040e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.953002e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.953002e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.935448e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.962699e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.962699e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609941e+02 +- 2.115589e+02 ) GeV^-2 -TOTAL : 0.638465 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,513,600,406 cycles # 2.877 GHz - 3,810,036,638 instructions # 1.52 insn per cycle - 0.930171723 seconds time elapsed +TOTAL : 0.638881 sec + 2,551,134,973 cycles # 2.829 GHz + 3,814,025,702 instructions # 1.50 insn per cycle + 0.960291968 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247487904286338 -Relative difference = 0.0003670698531228044 +Avg ME (F77/GPU) = 0.14247487171431850 +Relative difference = 0.0003670183967887531 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE 
program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.115307e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.140507e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.140507e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.072670e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.097133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.097133e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.490082 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,418,597,373 cycles # 2.958 GHz - 12,956,387,401 instructions # 2.93 insn per cycle - 1.494530314 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 645) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.549724 sec + 4,455,261,943 cycles # 2.869 GHz + 13,001,491,970 instructions # 2.92 insn per cycle + 1.553804785 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246861273719524 Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.871197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.051268e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.051268e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.775020e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.950077e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.950077e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.592243 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,749,393,716 cycles # 2.936 GHz - 4,590,457,409 instructions # 2.62 insn per cycle - 0.596762261 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3627) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.612678 sec + 1,763,964,947 cycles # 2.863 GHz + 4,612,364,671 instructions # 2.61 insn per cycle + 0.616741606 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3608) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246862329122401 Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.650062e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.340176e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.340176e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.406265e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.059656e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.059656e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.311783 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 875,769,738 cycles # 2.776 GHz - 1,954,803,706 instructions # 2.23 insn per cycle - 0.316080972 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3580) (512y: 0) (512z: 0) +TOTAL : 0.325484 sec + 894,227,621 cycles # 2.718 GHz + 1,973,650,274 instructions # 2.21 insn per cycle + 0.329612707 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.042794e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.845843e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.845843e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.495052e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.198837e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.198837e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.293361 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 825,335,769 cycles # 2.779 GHz - 1,870,845,111 instructions # 2.27 insn per cycle - 0.297556229 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3400) (512y: 22) (512z: 0) +TOTAL : 0.321201 sec + 866,167,930 cycles # 2.668 GHz + 1,901,550,421 instructions # 2.20 insn per cycle + 0.325340653 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3485) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.484934e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.935540e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.935540e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.189669e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.585230e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.585230e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.390040 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 749,752,693 cycles # 1.904 GHz - 1,350,296,093 instructions # 1.80 insn per cycle - 0.394449871 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1964) (512y: 24) (512z: 2435) +TOTAL : 0.417280 sec + 768,093,760 cycles # 1.825 GHz + 1,361,032,349 instructions # 1.77 insn per cycle + 0.423250195 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 2) (512z: 2428) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491576758442 Relative difference = 1.1066920862943416e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index cea07bf7e8..8874a06c98 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:58 +DATE: 2025-10-11_15:29:09 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.801672e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.945717e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.092440e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.726166e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.668422e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.110300e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.453252 sec -INFO: No Floating Point Exceptions have been reported - 1,914,636,683 cycles # 2.859 GHz - 2,699,162,883 instructions # 1.41 insn per cycle - 0.727606605 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 169 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.456732 sec + 1,986,727,615 cycles # 2.822 GHz + 2,734,105,162 instructions # 1.38 insn per cycle + 0.761604044 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 163 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.322683e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.438723e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.801307e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.139451e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.748092e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.065888e+07 ) sec^-1 MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 -TOTAL : 0.493317 sec -INFO: No Floating Point Exceptions have been reported - 2,100,361,107 cycles # 2.862 GHz - 2,955,351,040 instructions # 1.41 insn per cycle - 0.791031778 seconds time elapsed +TOTAL : 0.491750 sec + 2,144,083,987 cycles # 2.843 GHz + 2,965,934,309 instructions # 1.38 insn per cycle + 0.811495819 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247487904286338 -Relative difference = 0.0003670698531228044 +Avg ME (F77/GPU) = 0.14247487171431850 +Relative difference = 0.0003670183967887531 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.112466e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.138003e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.138003e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.088510e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.113295e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113295e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.490381 sec -INFO: No Floating Point Exceptions have been reported - 4,405,341,411 cycles # 2.950 GHz - 12,928,117,316 instructions # 2.93 insn per cycle - 1.494164072 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.523573 sec + 4,436,604,782 cycles # 2.906 GHz + 12,976,159,794 instructions # 2.92 insn per cycle + 1.527521775 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 635) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will 
cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246861273719524 Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.897278e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.076728e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.076728e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.835028e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.015163e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.015163e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.582482 sec -INFO: No Floating Point Exceptions have been reported - 1,724,294,786 cycles # 2.945 GHz - 4,536,655,836 instructions # 2.63 insn per cycle - 0.586223274 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3611) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.596717 sec + 1,741,466,538 cycles # 2.902 GHz + 4,559,733,587 instructions # 2.62 insn per cycle + 0.600733453 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3592) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246862329122401 Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.690817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.397497e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.397497e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.380055e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.028758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.028758e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.305315 sec -INFO: No Floating Point Exceptions have been reported - 857,155,838 cycles # 2.779 GHz - 1,914,615,212 instructions # 2.23 insn per cycle - 0.309003061 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3549) (512y: 0) (512z: 0) +TOTAL : 0.322659 sec + 877,270,879 cycles # 2.691 GHz + 1,934,809,792 instructions # 2.21 insn per cycle + 0.326541378 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3579) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.056800e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.870570e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.870570e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.601915e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.305503e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.305503e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.288177 sec -INFO: No Floating Point Exceptions have been reported - 804,254,194 cycles # 2.761 GHz - 1,829,977,116 instructions # 2.28 insn per cycle - 0.291930002 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3364) (512y: 22) (512z: 0) +TOTAL : 0.310801 sec + 841,602,182 cycles # 2.678 GHz + 1,861,524,675 instructions # 2.21 insn per cycle + 0.314890210 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3449) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.550897e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.994144e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.994144e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.229370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.636992e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.636992e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.380837 sec -INFO: No Floating Point Exceptions have been reported - 727,485,601 cycles # 1.894 GHz - 1,306,171,995 instructions # 1.80 insn per cycle - 0.384559776 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1928) (512y: 24) (512z: 2435) +TOTAL : 0.407631 sec + 742,675,842 cycles # 1.807 GHz + 1,318,218,015 instructions # 1.77 insn per cycle + 0.411673396 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1996) (512y: 2) (512z: 2428) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491576758442 Relative difference = 1.1066920862943416e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..86c9b7a546 --- /dev/null +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:44:24 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.435943e+06 1 256 +3.007907e+06 2 256 +5.634857e+06 4 256 +1.139868e+07 8 256 +2.191875e+07 16 256 +3.261770e+07 32 256 +3.913775e+07 64 256 +4.321439e+07 128 256 +4.782407e+07 256 256 +5.013042e+07 512 256 +5.117203e+07 1024 256 +### GPU: scaling test 32 +1.833223e+05 1 32 +3.625426e+05 2 32 +7.314829e+05 4 32 +1.459646e+06 8 32 +2.859760e+06 16 32 +5.667384e+06 32 32 +1.106459e+07 64 32 +2.218503e+07 128 32 +3.531887e+07 256 32 +3.896073e+07 512 32 +4.341558e+07 1024 32 +4.714542e+07 2048 32 +4.934308e+07 4096 32 +4.999316e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.008880e+05 1 256 +1.037575e+05 2 256 +1.026899e+05 4 256 +### CPU: scaling test 32 +8.543860e+04 1 32 +9.559401e+04 2 32 +9.690869e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.755069e+05 1 256 +1.824668e+05 2 256 +1.862361e+05 4 256 +### CPU: scaling test 32 +1.737091e+05 1 32 +1.676543e+05 2 32 +1.681730e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.270964e+05 1 256 +3.057259e+05 2 256 +3.141285e+05 4 256 +### CPU: scaling test 32 +2.994544e+05 1 32 +3.090295e+05 2 32 +3.346475e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.254054e+05 1 256 +3.252183e+05 2 256 +3.259569e+05 4 256 +### CPU: scaling test 32 +3.498874e+05 1 32 +3.542076e+05 2 32 +3.198481e+05 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.243613e+05 1 256 +2.351291e+05 2 256 +2.345114e+05 4 256 +### CPU: scaling test 32 +2.301860e+05 1 32 +2.329857e+05 2 32 +2.104986e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index cb0b82e9a4..d3f2e68af7 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,226 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:18 +DATE: 2025-10-11_15:28:08 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.883484e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.876597e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.990293e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.235119e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.971049e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180643e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.458412 sec -INFO: No Floating Point Exceptions have been reported - 1,935,066,146 cycles # 2.866 GHz - 2,699,989,812 instructions # 1.40 insn per cycle - 0.733387527 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.464283 sec + 2,023,320,904 cycles # 2.839 GHz + 2,773,493,223 instructions # 1.37 insn per cycle + 0.771475737 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 38 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.013974e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.497451e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.709351e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.827739e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.997089e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.176442e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.541801 sec -INFO: No Floating Point Exceptions have been reported - 2,287,504,645 cycles # 2.883 GHz - 3,220,826,671 instructions # 1.41 insn per cycle - 0.850636557 seconds time elapsed +TOTAL : 0.537726 sec + 2,282,885,717 cycles # 2.817 GHz + 3,160,756,797 instructions # 1.38 insn per cycle + 0.868903156 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482577104625 -Relative difference = 5.209967070245855e-07 +Avg ME (F77/GPU) = 0.14247482419639743 +Relative difference = 5.320488209618161e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.050634e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.073472e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.073472e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.042873e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.065099e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.065099e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.580404 sec -INFO: No Floating Point Exceptions have been reported - 4,643,189,098 cycles # 2.932 GHz - 13,180,741,468 instructions # 2.84 insn per cycle - 1.584505840 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.591072 sec + 4,638,115,400 cycles # 2.909 GHz + 13,236,410,026 instructions # 2.85 insn per cycle + 1.595277597 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will 
cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.871761e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.941517e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.941517e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.832450e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.902450e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.902450e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.895197 sec -INFO: No Floating Point Exceptions have been reported - 2,647,990,030 cycles # 2.947 GHz - 7,474,565,418 instructions # 2.82 insn per cycle - 0.899253220 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3152) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.913352 sec + 2,653,863,508 cycles # 2.895 GHz + 7,455,424,096 instructions # 2.81 insn per cycle + 0.917427770 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 +Avg ME (F77/C++) = 0.14247482733329694 +Relative difference = 5.100316128927506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.201825e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.415489e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.415489e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.117188e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.318909e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.318909e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.532048 sec -INFO: No Floating Point Exceptions have been reported - 1,472,019,476 cycles # 2.748 GHz - 3,129,064,583 instructions # 2.13 insn per cycle - 0.536341858 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3133) (512y: 0) (512z: 0) +TOTAL : 0.545094 sec + 1,478,675,993 cycles # 2.696 GHz + 3,118,440,007 instructions # 2.11 insn per cycle + 0.549086981 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3060) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.569463e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.831852e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.831852e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.250725e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.471460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.471460e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.479328 sec -INFO: No Floating Point Exceptions have been reported - 1,320,483,901 cycles # 2.736 GHz - 2,983,197,107 instructions # 2.26 insn per cycle - 0.483280271 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2895) (512y: 110) (512z: 0) +TOTAL : 0.523896 sec + 1,401,490,342 cycles # 2.658 GHz + 2,993,266,123 instructions # 2.14 insn per cycle + 0.527885129 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2873) (512y: 90) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.268192e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372574e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372574e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.231374e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.335386e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.335386e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.745303 sec -INFO: No Floating Point Exceptions have been reported - 1,365,795,021 cycles # 1.824 GHz - 1,991,870,632 instructions # 1.46 insn per cycle - 0.749335143 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1679) (512y: 108) (512z: 2251) +TOTAL : 0.756616 sec + 1,324,382,086 cycles # 1.743 GHz + 1,938,261,257 instructions # 1.46 insn per cycle + 0.760681799 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1363) (512y: 70) (512z: 2196) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 222758fe32..7ec5b5c818 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,226 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:32 +DATE: 2025-10-11_15:28:30 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.879429e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.807541e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.902111e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.256105e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.967576e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.174354e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.462530 sec -INFO: No Floating Point Exceptions have been reported - 1,930,179,746 cycles # 2.847 GHz - 2,724,788,037 instructions # 1.41 insn per cycle - 0.736957830 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.463340 sec + 2,028,215,818 cycles # 2.846 GHz + 2,776,961,604 instructions # 1.37 insn per cycle + 0.769909609 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 38 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.958663e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.373563e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.576283e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.777604e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.905810e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.079424e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.543078 sec -INFO: No Floating Point Exceptions have been reported - 2,226,045,922 cycles # 2.831 GHz - 3,151,460,121 instructions # 1.42 insn per cycle - 0.843097781 seconds time elapsed +TOTAL : 0.537813 sec + 2,311,546,315 cycles # 2.847 GHz + 3,204,384,721 instructions # 1.39 insn per cycle + 0.869430768 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482577104625 -Relative difference = 5.209967070245855e-07 +Avg ME (F77/GPU) = 0.14247482419639743 +Relative difference = 5.320488209618161e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] 
[inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.049471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.072251e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.072251e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.027944e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049964e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.049964e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.581795 sec -INFO: No Floating Point Exceptions have been reported - 4,647,850,638 cycles # 2.932 GHz - 13,168,659,581 instructions # 2.83 insn per cycle - 1.585735048 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 666) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.613580 sec + 4,641,772,345 cycles # 2.871 GHz + 13,214,748,096 instructions # 2.85 insn per cycle + 1.617579626 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 679) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.863863e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.934907e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.934907e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.824575e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893158e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893158e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.898950 sec -INFO: No Floating Point Exceptions have been reported - 2,647,565,316 cycles # 2.935 GHz - 7,477,127,209 instructions # 2.82 insn per cycle - 0.902852166 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.916995 sec + 2,647,231,235 cycles # 2.877 GHz + 7,451,993,603 instructions # 2.82 insn per cycle + 0.920907127 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3057) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 +Avg ME (F77/C++) = 0.14247482733329694 +Relative difference = 5.100316128927506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.193877e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.403471e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.403471e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.116778e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.320418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.320418e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.533092 sec -INFO: No Floating Point Exceptions have been reported - 1,474,101,191 cycles # 2.747 GHz - 3,129,731,788 instructions # 2.12 insn per cycle - 0.537323582 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3111) (512y: 0) (512z: 0) +TOTAL : 0.545336 sec + 1,472,587,180 cycles # 2.683 GHz + 3,116,400,718 instructions # 2.12 insn per cycle + 0.549340783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3043) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.595782e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.860984e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.860984e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.223699e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.443094e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.443094e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.475643 sec -INFO: No Floating Point Exceptions have been reported - 1,319,166,719 cycles # 2.754 GHz - 2,983,572,989 instructions # 2.26 insn per cycle - 0.479589426 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2871) (512y: 110) (512z: 0) +TOTAL : 0.528265 sec + 1,399,996,992 cycles # 2.634 GHz + 2,990,999,773 instructions # 2.14 insn per cycle + 0.532237029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2854) (512y: 90) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.265955e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372021e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372021e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.302312e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.410857e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.410857e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.745301 sec -INFO: No Floating Point Exceptions have been reported - 1,365,993,831 cycles # 1.825 GHz - 1,991,757,917 instructions # 1.46 insn per cycle - 0.749395729 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 108) (512z: 2251) +TOTAL : 0.733431 sec + 1,324,620,583 cycles # 1.798 GHz + 1,936,852,170 instructions # 1.46 insn per cycle + 0.737506511 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1344) (512y: 70) (512z: 2196) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 9b3f75797b..14462fa0eb 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:01:13 +DATE: 2025-10-11_17:04:42 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.147069e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.778623e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.394888e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.654485e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.404459e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.690060e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.535117 sec -INFO: No Floating Point Exceptions have been reported - 2,222,375,781 cycles # 2.890 GHz - 3,181,150,200 instructions # 1.43 insn per cycle - 0.828824866 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 228 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.541401 sec + 2,305,332,177 cycles # 2.847 GHz + 3,197,913,952 instructions # 1.39 insn per cycle + 0.868100814 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 204 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134710926110280 Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.628496e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.666122e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.666122e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.571130e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.606300e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.606300e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.554723 sec -INFO: No Floating Point Exceptions have been reported - 19,293,957,259 cycles # 2.941 GHz - 51,936,518,995 instructions # 2.69 insn per cycle - 6.561734499 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.786947 sec + 19,519,870,393 cycles # 2.875 GHz + 52,258,888,975 instructions # 2.68 insn per cycle + 6.792671431 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.914767e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.044981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.044981e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.857187e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.984563e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.984563e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.713846 sec -INFO: No Floating Point Exceptions have been reported - 10,942,394,234 cycles # 2.942 GHz - 30,809,451,561 instructions # 2.82 insn per cycle - 3.720459537 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2915) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.780938 sec + 10,994,068,173 cycles # 2.904 GHz + 30,917,710,259 instructions # 2.81 insn per cycle + 3.786765562 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.701521e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.038587e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.038587e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.468427e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.776131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.776131e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.349075 sec -INFO: No Floating Point Exceptions have been reported - 6,518,044,155 cycles # 2.767 GHz - 13,691,830,614 instructions # 2.10 insn per cycle - 2.356266703 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2941) (512y: 0) (512z: 0) +TOTAL : 2.458667 sec + 6,708,728,258 cycles # 2.723 GHz + 13,712,517,378 instructions # 2.04 insn per cycle + 2.464482201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2936) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.169544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.582169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.582169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.847459e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.209715e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.209715e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.151067 sec -INFO: No Floating Point Exceptions have been reported - 5,973,431,908 cycles # 2.768 GHz - 13,032,735,919 instructions # 2.18 insn per cycle - 2.158817844 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2667) (512y: 146) (512z: 0) +TOTAL : 2.275732 sec + 6,180,724,079 cycles # 2.710 GHz + 13,193,237,105 instructions # 2.13 insn per cycle + 2.281442783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2714) (512y: 126) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.442417e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.620453e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.620453e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.203485e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.355713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.355713e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.166223 sec -INFO: No Floating Point Exceptions have been reported - 5,879,580,303 cycles # 1.853 GHz - 8,614,888,302 instructions # 1.47 insn per cycle - 3.173636028 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1506) (512y: 128) (512z: 1946) +TOTAL : 3.384877 sec + 5,997,535,040 cycles # 1.769 GHz + 8,705,216,175 instructions # 1.45 insn per cycle + 3.390523516 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1546) (512y: 106) (512z: 1954) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt index fe94934cb0..c1b909362e 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:01:39 +DATE: 2025-10-11_17:05:16 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.155696e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.811430e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.416776e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.602305e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.299861e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.572992e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.531667 sec -INFO: No Floating Point Exceptions have been reported - 2,222,115,079 cycles # 2.893 GHz - 3,196,008,298 instructions # 1.44 insn per cycle - 0.825144177 seconds time elapsed 
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 216 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.543522 sec + 2,289,271,142 cycles # 2.817 GHz + 3,205,208,831 instructions # 1.40 insn per cycle + 0.870293269 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134710926110280 Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.710634e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.751435e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.751435e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.653039e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.691951e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.691951e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.246482 sec -INFO: No Floating Point Exceptions have been reported - 18,390,828,933 cycles # 2.942 GHz - 50,070,723,541 instructions # 2.72 insn per cycle - 6.253313848 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.455303 sec + 18,685,885,377 cycles # 2.893 GHz + 50,237,697,539 instructions # 2.69 insn per cycle + 6.460495783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 611) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] 
[hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.069031e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.214398e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.214398e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.954178e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.091326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.091326e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.535906 sec -INFO: No Floating Point Exceptions have been reported - 10,415,008,507 cycles # 2.940 GHz - 29,198,189,749 instructions # 2.80 insn per cycle - 3.543300262 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2733) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.661921 sec + 10,461,474,208 cycles # 2.853 GHz + 29,320,644,078 instructions # 2.80 insn per cycle + 3.667913174 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2712) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) 
[cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.327920e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.613203e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.613203e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.223646e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.500682e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.500682e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.541514 sec -INFO: No Floating Point Exceptions have been reported - 7,032,477,509 cycles # 2.760 GHz - 15,175,173,386 instructions # 2.16 insn per cycle - 2.548867076 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3020) (512y: 0) (512z: 0) +TOTAL : 2.594203 sec + 6,988,437,642 cycles # 2.689 GHz + 15,195,785,073 instructions # 2.17 insn per cycle + 2.599980482 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3011) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.529226e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.840126e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.840126e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] 
(23) = ( 4.417064e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.714981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.714981e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.433970 sec -INFO: No Floating Point Exceptions have been reported - 6,732,593,285 cycles # 2.759 GHz - 14,647,151,783 instructions # 2.18 insn per cycle - 2.441354685 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2621) (512y: 302) (512z: 0) +TOTAL : 2.485778 sec + 6,715,707,590 cycles # 2.696 GHz + 14,680,064,315 instructions # 2.19 insn per cycle + 2.491527768 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2612) (512y: 302) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.326729e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.490201e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.490201e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.163644e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.312325e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.312325e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) 
GeV^0 -TOTAL : 3.271504 sec -INFO: No Floating Point Exceptions have been reported - 6,070,928,941 cycles # 1.852 GHz - 10,360,391,243 instructions # 1.71 insn per cycle - 3.278977914 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1268) (512y: 214) (512z: 2129) +TOTAL : 3.425924 sec + 6,178,650,952 cycles # 1.801 GHz + 10,506,622,006 instructions # 1.70 insn per cycle + 3.431763355 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1317) (512y: 216) (512z: 2136) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index 8cd2c74f38..32d858512c 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,242 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:03:02 +DATE: 2025-10-11_17:06:56 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.465620e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.510965e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.608079e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.746430e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.525187e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.618301e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.492110 sec -INFO: No Floating Point Exceptions have been reported - 2,084,727,455 cycles # 2.877 GHz - 2,955,736,176 instructions # 1.42 insn per cycle - 0.784112386 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 131 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.494982 sec + 2,135,489,785 cycles # 2.833 GHz + 2,986,554,714 instructions # 1.40 insn per cycle + 0.812364995 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 99 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 24 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313490e+00 -Avg ME (F77/GPU) = 4.3136695491848513 -Relative difference = 4.162503792787837e-05 +Avg ME (F77/GPU) = 4.3136695760767907 +Relative difference = 4.1631272308702715e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.686557e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.727704e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.727704e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.639930e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.679722e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.679722e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 6.305463 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 18,635,302,225 cycles # 2.953 GHz - 51,219,407,083 instructions # 2.75 insn per cycle - 6.310992251 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.483754 sec + 18,765,516,643 cycles # 2.893 GHz + 51,374,423,413 instructions # 2.74 insn per cycle + 6.489228485 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following 
Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313574e+00 Avg ME (F77/C++) = 4.3135738277342170 Relative difference = 3.9935743068669333e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.043062e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.307407e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.307407e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.904149e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.155838e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.155838e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.681205 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 7,932,523,130 cycles # 2.953 GHz - 19,317,767,787 instructions # 2.44 insn per cycle - 2.686665617 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3542) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.775203 sec + 8,009,571,813 cycles # 2.881 GHz + 19,418,906,078 instructions # 2.42 insn per cycle + 2.780526828 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3524) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313572e+00 Avg ME (F77/C++) = 4.3135722697479650 Relative difference = 6.253470796314402e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.901471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.926003e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.926003e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.670886e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.626596e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.626596e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.413719 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,953,020,388 cycles # 2.786 GHz - 8,832,668,299 instructions # 2.23 insn per cycle - 1.419629254 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3715) (512y: 0) (512z: 0) +TOTAL : 1.456000 sec + 3,972,178,441 cycles # 2.719 GHz + 8,869,239,722 instructions # 2.23 insn per cycle + 1.461741307 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3709) (512y: 0) (512z: 0) 
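For orientation, the EvtsPerSec figures above can be sanity-checked with simple arithmetic. The sketch below assumes that the "-p 2048 256 2" arguments are read as blocks x threads x iterations and that each EvtsPerSec value is the event count divided by the time spent in the corresponding phase; which work each timer actually covers is defined inside check_*.exe, so this is only a back-of-the-envelope illustration using the avx2 single-precision numbers just above:

```cpp
// Back-of-the-envelope check of a throughput figure from this log.
// Assumption: "-p 2048 256 2" means blocks x threads x iterations.
#include <cstdio>

int main()
{
  const long blocks = 2048, threads = 256, iterations = 2;
  const long nevents = blocks * threads * iterations; // 1048576 events in total
  const double evtsPerSecME = 8.626596e+05;           // EvtsPerSec[MatrixElems] above
  const double impliedMeSeconds = nevents / evtsPerSecME; // ~1.22 s
  std::printf( "events = %ld, implied ME-only time = %.3f s\n", nevents, impliedMeSeconds );
  return 0;
}
```

The implied ME-only time (~1.22 s) is consistently smaller than the reported TOTAL of 1.456 s for that run, since TOTAL also includes phase-space sampling and other overheads.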
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645242873579 -Relative difference = 1.1028294269894893e-07 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.392997e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.544307e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.544307e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.928240e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.948874e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.948874e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.337803 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,735,491,375 cycles # 2.782 GHz - 8,430,906,889 instructions # 2.26 insn per cycle - 1.343508069 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3541) (512y: 20) (512z: 0) +TOTAL : 1.411952 sec + 3,818,419,324 cycles # 2.695 GHz + 8,547,519,956 instructions # 2.24 insn per cycle + 1.417398798 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3594) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645242873579 -Relative difference = 1.1028294269894893e-07 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.024352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.578236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.578236e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.574912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.065441e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.065441e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.827995 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,508,723,607 cycles # 1.915 GHz - 6,244,798,669 instructions # 1.78 insn per cycle - 1.833521857 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2325) (512y: 22) (512z: 2290) +TOTAL : 1.971243 sec + 3,626,432,325 cycles # 1.835 GHz + 6,319,513,510 instructions # 1.74 insn per cycle + 1.976911767 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2377) (512y: 0) (512z: 2299) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313564e+00 -Avg ME (F77/C++) = 4.3135643536224961 -Relative difference = 8.197919301304478e-08 +Avg ME (F77/C++) = 4.3135642320849001 +Relative difference = 5.380351369373482e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt index 1ff1d26090..218c8378c2 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,246 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:03:24 +DATE: 2025-10-11_17:07:25 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.690902e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.615208e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.727767e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.779658e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.535884e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.628235e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.493976 sec -INFO: No Floating Point Exceptions have been reported - 2,066,790,877 cycles # 2.843 GHz - 2,969,404,210 instructions # 1.44 insn per cycle - 0.785535997 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 125 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.493747 sec + 2,136,570,540 cycles # 2.832 GHz + 2,955,252,814 instructions # 1.38 insn per cycle + 0.811353108 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 100 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 24 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313490e+00 -Avg ME (F77/GPU) = 4.3136695491848513 -Relative difference = 4.162503792787837e-05 +Avg ME (F77/GPU) = 4.3136695760767907 +Relative difference = 4.1631272308702715e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.736131e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.779781e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.779781e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.693969e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.736524e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.736524e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 6.127979 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 18,032,140,147 cycles # 2.940 GHz - 49,602,643,371 instructions # 2.75 insn per cycle - 6.133935412 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 613) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.279316 sec + 18,165,491,134 cycles # 2.891 GHz + 49,676,906,698 instructions # 2.73 insn per cycle + 6.284692119 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 607) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following 
Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313574e+00 Avg ME (F77/C++) = 4.3135738277342170 Relative difference = 3.9935743068669333e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.506367e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.839198e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.839198e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.443862e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.778187e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.778187e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.414203 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 7,115,995,603 cycles # 2.942 GHz - 18,533,869,751 instructions # 2.60 insn per cycle - 2.419892180 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3252) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.449024 sec + 7,084,328,481 cycles # 2.887 GHz + 18,582,770,693 instructions # 2.62 insn per cycle + 2.454447463 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3222) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313572e+00 Avg ME (F77/C++) = 4.3135722697479650 Relative difference = 6.253470796314402e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.374488e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.825683e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.825683e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.216367e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.641236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.641236e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.037733 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 5,644,200,229 cycles # 2.763 GHz - 10,848,148,808 instructions # 1.92 insn per cycle - 2.043741542 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4274) (512y: 0) (512z: 0) +TOTAL : 2.098866 sec + 5,652,855,011 cycles # 2.688 GHz + 10,909,770,006 instructions # 1.93 insn per cycle + 2.104181652 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4283) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645242873579 -Relative difference = 1.1028294269894893e-07 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.433283e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.894901e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.894901e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.314509e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.753400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.753400e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.017462 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 5,594,464,289 cycles # 2.767 GHz - 10,554,918,385 instructions # 1.89 insn per cycle - 2.022782231 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4138) (512y: 12) (512z: 0) +TOTAL : 2.062043 sec + 5,590,274,103 cycles # 2.706 GHz + 10,617,976,090 instructions # 1.90 
insn per cycle + 2.067292425 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4142) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645242873579 -Relative difference = 1.1028294269894893e-07 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.364066e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.648223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.648223e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.151626e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.412256e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.412256e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.491143 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 4,639,687,839 cycles # 1.859 GHz - 8,661,216,579 instructions # 1.87 insn per cycle - 2.496647539 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) 
(avx2: 2799) (512y: 0) (512z: 2885) +TOTAL : 2.614832 sec + 4,741,117,769 cycles # 1.810 GHz + 8,743,372,129 instructions # 1.84 insn per cycle + 2.620465706 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2850) (512y: 0) (512z: 2889) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313564e+00 -Avg ME (F77/C++) = 4.3135643536224961 -Relative difference = 8.197919301304478e-08 +Avg ME (F77/C++) = 4.3135642320849001 +Relative difference = 5.380351369373482e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index 12c9da87af..f4ff8c446a 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,246 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:02:06 +DATE: 2025-10-11_17:05:47 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.131914e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.755854e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.359452e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.626534e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.403274e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.688448e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.535565 sec -INFO: No Floating Point Exceptions have been reported - 2,204,224,001 cycles # 2.864 GHz - 3,121,247,303 instructions # 1.42 insn per cycle - 0.828499405 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 228 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.543452 sec + 2,301,166,740 cycles # 2.836 GHz + 3,210,334,164 instructions # 1.40 insn per cycle + 0.870784678 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 204 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 -Avg ME (F77/GPU) = 4.3134711012809239 -Relative difference = 2.0835166567625394e-07 +Avg ME (F77/GPU) = 4.3134712619343958 +Relative difference = 1.711070812999077e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.529079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.561968e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.561968e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.489645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.521138e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.521138e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.973239 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 20,550,139,482 cycles # 2.945 GHz - 51,941,635,065 instructions # 2.53 insn per cycle - 6.980082779 seconds time elapsed +TOTAL : 7.151635 sec + 20,539,261,330 cycles # 2.870 GHz + 52,312,072,955 instructions # 2.55 insn per cycle + 7.157317940 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711778082178 -Relative difference = 1.906102050071626e-07 +Avg ME (F77/C++) = 4.3134711782756741 +Relative difference = 1.9050183377028104e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.672019e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.782339e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.782339e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.635024e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.743558e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.743558e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 4.043433 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 11,521,778,322 cycles # 2.845 GHz - 30,615,090,868 instructions # 2.66 insn per cycle - 4.050715703 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2972) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.091108 sec + 11,568,480,565 cycles # 2.825 GHz + 30,592,470,506 instructions # 2.64 insn per cycle + 4.096724147 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711778082178 -Relative difference = 1.906102050071626e-07 +Avg ME (F77/C++) = 4.3134711778081822 +Relative difference = 1.9061021324348284e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.474164e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.781347e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.781347e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.442158e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.748594e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.748594e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.469295 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,758,530,167 cycles # 2.729 GHz - 13,653,357,404 instructions # 2.02 insn per cycle - 2.477625143 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3118) (512y: 0) (512z: 0) +TOTAL : 2.473093 sec + 6,663,246,815 cycles # 2.689 GHz + 13,582,195,938 instructions # 2.04 insn per cycle + 2.478977008 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 
0) (avx2: 3085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712319139954 -Relative difference = 1.7806676491157786e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.946193e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312777e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.312777e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.658370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.993226e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.993226e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.239110 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,206,585,291 cycles # 2.765 GHz - 13,005,835,459 instructions # 2.10 insn per cycle - 2.246664710 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2851) (512y: 150) (512z: 0) +TOTAL : 2.362618 sec + 6,353,039,315 cycles # 2.684 GHz + 
13,072,016,547 instructions # 2.06 insn per cycle + 2.368607155 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2867) (512y: 130) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712319139954 -Relative difference = 1.7806676491157786e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.130780e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.276017e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.276017e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.116355e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.262209e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.262209e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.470623 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,429,525,372 cycles # 1.849 GHz - 8,729,822,669 instructions # 1.36 insn per cycle - 3.478318009 seconds time elapsed -=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1792) (512y: 130) (512z: 2014) +TOTAL : 3.476875 sec + 6,216,987,973 cycles # 1.786 GHz + 8,426,779,606 instructions # 1.36 insn per cycle + 3.483074770 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1598) (512y: 96) (512z: 1978) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712319139954 -Relative difference = 1.7806676491157786e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt index 90c964242c..f78a78f7e9 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,246 +10,213 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:02:34 +DATE: 2025-10-11_17:06:21 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.143359e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.817002e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.430401e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.581022e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.292223e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.567393e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.533031 sec -INFO: No Floating Point Exceptions have been reported - 2,222,154,822 cycles # 2.885 GHz - 3,215,427,054 instructions # 1.45 insn per cycle - 0.826924367 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 216 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.541711 sec + 2,303,336,148 cycles # 2.840 GHz + 3,222,227,466 instructions # 1.40 insn per cycle + 0.868265701 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 -Avg ME (F77/GPU) = 4.3134711012809239 -Relative difference = 2.0835166567625394e-07 +Avg ME (F77/GPU) = 4.3134712619343958 +Relative difference = 1.711070812999077e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.616471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.652773e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.652773e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.563907e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.598575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.598575e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.603326 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 19,494,406,226 cycles # 2.950 GHz - 49,966,413,800 instructions # 2.56 insn per cycle - 6.609959024 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 599) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.817167 sec + 19,709,237,083 cycles # 2.890 GHz + 50,290,409,188 instructions # 2.55 insn per cycle + 6.822753554 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 611) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following 
Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711778082178 -Relative difference = 1.906102050071626e-07 +Avg ME (F77/C++) = 4.3134711782756741 +Relative difference = 1.9050183377028104e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.890177e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.018164e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.018164e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.841525e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.969254e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.969254e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.745798 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 11,068,643,232 cycles # 2.950 GHz - 29,164,471,893 instructions # 2.63 insn per cycle - 3.753005329 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2815) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.802477 sec + 11,003,460,648 cycles # 2.890 GHz + 29,103,019,269 instructions # 2.64 insn per cycle + 3.808301655 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2766) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711778082178 -Relative difference = 1.906102050071626e-07 +Avg ME (F77/C++) = 4.3134711778081822 +Relative difference = 1.9061021324348284e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.744994e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.955254e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.955254e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.769392e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.987989e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.987989e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.917714 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 8,087,123,435 cycles # 2.766 GHz - 15,210,355,188 instructions # 1.88 insn per cycle - 2.924634632 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3203) (512y: 0) (512z: 0) +TOTAL : 2.893528 sec + 7,880,875,441 cycles # 2.719 GHz + 15,079,012,118 instructions # 1.91 insn per cycle + 2.899352011 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 
0) (avx2: 3163) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712319139954 -Relative difference = 1.7806676491157786e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.909194e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.140218e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.140218e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.967773e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.208568e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.208568e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.798673 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 7,730,347,780 cycles # 2.756 GHz - 14,498,978,915 instructions # 1.88 insn per cycle - 2.805768338 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2775) (512y: 304) (512z: 0) +TOTAL : 2.753936 sec + 7,508,856,368 cycles # 2.722 GHz + 
14,417,603,283 instructions # 1.92 insn per cycle + 2.759752652 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2737) (512y: 304) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712319139954 -Relative difference = 1.7806676491157786e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.049249e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.186111e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.186111e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.068489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.209462e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.209462e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.561293 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,578,699,260 cycles # 1.844 GHz - 9,927,155,424 instructions # 1.51 insn per cycle - 3.569129809 seconds time elapsed -=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1565) (512y: 216) (512z: 2216) +TOTAL : 3.528645 sec + 6,308,539,404 cycles # 1.786 GHz + 9,645,872,961 instructions # 1.53 insn per cycle + 3.534370742 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1371) (512y: 204) (512z: 2172) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712319139954 -Relative difference = 1.7806676491157786e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index 2b34ea67ad..b64bd08c6e 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:07 +DATE: 2025-10-11_17:02:19 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.760509e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.779507e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.782702e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.749715e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.123100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.185595e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.473450 sec -INFO: No Floating Point Exceptions have been reported - 1,994,326,240 cycles # 2.874 GHz - 2,845,102,706 instructions # 1.43 insn per cycle - 0.753810347 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.460632 sec + 2,016,310,298 cycles # 2.828 GHz + 2,811,062,777 instructions # 1.39 insn per cycle + 0.771405460 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.019067e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.126130e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.133988e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.798297e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.902790e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.910598e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.490494 sec -INFO: No Floating Point Exceptions have been reported - 2,031,600,016 cycles # 2.857 GHz - 2,995,319,726 instructions # 1.47 insn per cycle - 0.772627668 seconds time elapsed +TOTAL : 0.483683 sec + 2,080,405,450 cycles # 2.828 GHz + 2,919,633,235 instructions # 1.40 insn per cycle + 0.795243442 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 Avg ME (F77/GPU) = 8.1274562860176604E-006 Relative difference = 3.3392753366481633e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause 
SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.383469e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.386752e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.386752e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.386932e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.390193e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.390193e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.158264 sec -INFO: No Floating Point Exceptions have been reported - 469,342,334 cycles # 2.906 GHz - 1,390,298,076 instructions # 2.96 insn per cycle - 0.162106230 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3908) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.158198 sec + 459,847,306 cycles # 2.852 GHz + 1,381,276,044 instructions # 3.00 insn per cycle + 0.161817794 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1508) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167185E-006 Relative difference = 3.339276495559746e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.476358e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.488167e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.488167e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.255945e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.267065e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.267065e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.083612 sec -INFO: No Floating Point Exceptions have been reported - 240,584,825 cycles # 2.769 GHz - 693,113,903 instructions # 2.88 insn per cycle - 0.087424946 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9482) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.086223 sec + 240,474,211 cycles # 2.695 GHz + 691,658,857 instructions # 2.88 insn per cycle + 0.089852973 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167168E-006 Relative difference = 3.3392764976441195e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.432068e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.438681e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.438681e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.385213e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.390914e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.390914e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038906 sec -INFO: No Floating Point Exceptions have been reported - 114,140,366 cycles # 2.711 GHz - 257,891,266 instructions # 2.26 insn per cycle - 0.042661267 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8501) (512y: 0) (512z: 0) +TOTAL : 0.040134 sec + 114,132,005 cycles # 2.644 GHz + 258,038,380 instructions # 2.26 insn per cycle + 0.043763583 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8583) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.618386e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.625883e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.625883e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.538966e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.546528e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.546528e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.034671 sec -INFO: No Floating Point Exceptions have been reported - 102,555,024 cycles # 2.705 GHz - 240,017,026 instructions # 2.34 insn per cycle - 0.038425016 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8143) (512y: 150) (512z: 0) +TOTAL : 0.036228 sec + 103,692,755 cycles # 2.641 GHz + 240,622,200 instructions # 2.32 insn per cycle + 0.039728552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8271) (512y: 130) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.192893e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.198052e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.198052e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.148417e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.153199e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.153199e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.046494 sec -INFO: No Floating Point Exceptions have been reported - 90,048,800 cycles # 1.806 GHz - 134,302,710 instructions # 1.49 insn per cycle - 0.050438224 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1943) (512y: 126) (512z: 7086) +TOTAL : 0.048211 sec + 90,387,142 cycles # 1.755 GHz + 134,612,621 instructions # 1.49 insn per cycle + 0.052002771 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2130) (512y: 104) (512z: 7074) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt index dc41fe503f..4db43dd255 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:18 +DATE: 2025-10-11_17:02:42 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.797107e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.816023e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.819423e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.803202e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.181220e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.245341e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.469966 sec -INFO: No Floating Point Exceptions have been reported - 2,001,057,465 cycles # 2.881 GHz - 2,930,552,926 instructions # 1.46 insn per cycle - 0.752195966 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.458543 sec + 2,011,139,566 cycles # 2.825 GHz + 2,801,263,226 instructions # 1.39 insn per cycle + 0.769027350 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.121137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.233030e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.241027e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.788680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.895418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.902637e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.489610 sec -INFO: No Floating Point Exceptions have been reported - 2,050,200,483 cycles # 2.873 GHz - 3,056,241,818 instructions # 1.49 insn per cycle - 0.771808178 seconds time elapsed +TOTAL : 0.483711 sec + 2,072,169,922 cycles # 2.815 GHz + 2,948,772,929 instructions # 1.42 insn per cycle + 0.795276590 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 Avg ME (F77/GPU) = 8.1274562860176604E-006 Relative difference = 3.3392753366481633e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause 
SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.406266e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.409565e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.409565e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.383885e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.387148e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.387148e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.156466 sec -INFO: No Floating Point Exceptions have been reported - 465,689,745 cycles # 2.917 GHz - 1,385,079,930 instructions # 2.97 insn per cycle - 0.160315659 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3796) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.157412 sec + 457,302,712 cycles # 2.851 GHz + 1,376,801,855 instructions # 3.01 insn per cycle + 0.160964317 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1502) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167185E-006 Relative difference = 3.339276495559746e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.388983e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.401822e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.401822e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.288759e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.301116e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.301116e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.083799 sec -INFO: No Floating Point Exceptions have been reported - 238,961,924 cycles # 2.745 GHz - 689,073,758 instructions # 2.88 insn per cycle - 0.087593094 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9525) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.085024 sec + 238,495,422 cycles # 2.707 GHz + 687,028,266 instructions # 2.88 insn per cycle + 0.088746242 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9384) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167168E-006 Relative difference = 3.3392764976441195e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.419818e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.425419e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.425419e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.395926e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.401596e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.401596e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038479 sec -INFO: No Floating Point Exceptions have been reported - 111,800,811 cycles # 2.682 GHz - 253,484,287 instructions # 2.27 insn per cycle - 0.042138594 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8457) (512y: 0) (512z: 0) +TOTAL : 0.039010 sec + 112,073,428 cycles # 2.662 GHz + 253,139,110 instructions # 2.26 insn per cycle + 0.042677736 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8538) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.620452e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.628839e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.628839e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.525855e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.532589e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.532589e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.033872 sec -INFO: No Floating Point Exceptions have been reported - 100,998,379 cycles # 2.706 GHz - 235,641,730 instructions # 2.33 insn per cycle - 0.037957581 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8101) (512y: 150) (512z: 0) +TOTAL : 0.035869 sec + 101,601,884 cycles # 2.611 GHz + 235,894,497 instructions # 2.32 insn per cycle + 0.039518260 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8224) (512y: 130) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.156678e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161477e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.161477e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.142399e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.147704e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.147704e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.047111 sec -INFO: No Floating Point Exceptions have been reported - 88,066,978 cycles # 1.743 GHz - 129,735,533 instructions # 1.47 insn per cycle - 0.051105123 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1899) (512y: 126) (512z: 7084) +TOTAL : 0.047633 sec + 88,136,356 cycles # 1.737 GHz + 129,828,247 instructions # 1.47 insn per cycle + 0.051419113 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2084) (512y: 104) (512z: 7074) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 4b10dcf1d1..5211bad1d2 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:52 +DATE: 2025-10-11_17:03:51 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.214342e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.224285e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.226222e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.302427e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.704300e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.791284e+05 ) sec^-1 MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.476842 sec -INFO: No Floating Point Exceptions have been reported - 1,989,613,876 cycles # 2.873 GHz - 2,928,089,356 instructions # 1.47 insn per cycle - 0.750924959 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.462607 sec + 2,015,593,801 cycles # 2.836 GHz + 2,784,970,796 instructions # 1.38 insn per cycle + 0.770212174 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.950242e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.029144e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.036217e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.020494e-03 +- 4.025605e-03 ) GeV^-4 -TOTAL : 0.473909 sec -INFO: No Floating Point Exceptions have been reported - 1,995,145,721 cycles # 2.886 GHz - 2,912,342,089 instructions # 1.46 insn per cycle - 0.748274226 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.169898e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.187942e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.190235e+06 ) sec^-1 +MeanMatrixElemValue = ( 8.020495e-03 +- 4.025605e-03 ) GeV^-4 +TOTAL : 0.469557 sec + 2,042,790,873 cycles # 2.836 GHz + 2,884,156,824 instructions # 1.41 insn per cycle + 0.777382571 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272869669930272E-006 -Relative difference = 4.548524165778887e-06 +Avg ME (F77/GPU) = 8.1272869086972111E-006 +Relative difference = 4.541351282443064e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.462777e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.466245e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.466245e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.579211e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.582825e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.582825e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.154509 sec -INFO: No Floating Point Exceptions have been reported - 463,950,135 cycles # 2.942 GHz - 1,382,102,782 instructions # 2.98 insn per cycle - 0.158280886 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3058) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.149618 sec + 441,460,345 cycles # 2.891 GHz + 1,357,431,891 instructions # 3.07 insn per cycle + 0.153196109 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105271212486E-006 -Relative difference = 5.8180333155894157e-08 +Avg ME (F77/C++) = 8.1278105256181649E-006 +Relative difference = 5.836526409016727e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.221716e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.226773e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.226773e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.178631e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183684e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183684e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.045148 sec -INFO: No Floating Point Exceptions have been reported - 132,927,826 cycles # 2.743 GHz - 372,156,154 instructions # 2.80 insn per cycle - 0.049041087 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:10141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.046713 sec + 133,037,126 cycles # 2.662 GHz + 371,430,035 instructions # 2.79 insn per cycle + 0.050453436 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9988) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127809e-06 Avg ME (F77/C++) = 8.1278090510674588E-006 Relative difference = 6.2830535070193674e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.776220e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.801025e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.801025e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.599910e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.621223e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.621223e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.021005 sec -INFO: No Floating Point Exceptions have been reported - 65,153,242 cycles # 2.690 GHz - 142,838,093 instructions # 2.19 insn per cycle - 0.024771930 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9241) (512y: 0) (512z: 0) +TOTAL : 0.022499 sec + 65,701,477 cycles # 2.576 GHz + 142,904,938 instructions # 2.18 insn per cycle + 0.026069649 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9322) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 Avg ME (F77/C++) = 8.1275366216540664E-006 Relative difference = 4.655111786058001e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.070417e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.098717e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.098717e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.684576e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.708888e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.708888e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.019184 sec -INFO: No Floating Point Exceptions have been reported - 60,296,621 cycles # 2.678 GHz - 132,772,434 instructions # 2.20 insn per cycle - 0.023065155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8959) (512y: 28) (512z: 0) +TOTAL : 0.021728 sec + 60,421,247 cycles # 2.428 GHz + 133,158,601 instructions # 2.20 insn per cycle + 0.025465207 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9093) (512y: 8) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 Avg ME (F77/C++) = 8.1275366216540664E-006 Relative difference = 4.655111786058001e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.324469e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.345673e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.345673e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.239020e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.260813e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.260813e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.024875 sec -INFO: No Floating Point Exceptions have been reported - 52,411,208 cycles # 1.857 GHz - 79,637,147 instructions # 1.52 insn per cycle - 0.028776798 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2836) (512y: 30) (512z: 7437) +TOTAL : 0.025827 sec + 52,150,255 cycles # 1.790 GHz + 79,743,681 instructions # 1.53 insn per cycle + 0.029792364 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3031) (512y: 8) (512z: 7424) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 Avg ME (F77/C++) = 8.1275369863475849E-006 Relative difference = 1.6797726498700304e-09 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt index 67a7328c67..c79acb423d 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:01:02 +DATE: 2025-10-11_17:04:20 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.235104e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.244507e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.246621e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.351614e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.802263e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.888038e+05 ) sec^-1 MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.477845 sec -INFO: No Floating Point Exceptions have been reported - 1,997,911,903 cycles # 2.876 GHz - 2,886,764,809 instructions # 1.44 insn per cycle - 0.753229194 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.458224 sec + 1,995,767,929 cycles # 2.816 GHz + 2,740,980,318 instructions # 1.37 insn per cycle + 0.766478985 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.096496e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.193422e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.206590e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.181811e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.198606e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.200307e+06 ) sec^-1 MeanMatrixElemValue = ( 8.020496e-03 +- 4.025606e-03 ) GeV^-4 -TOTAL : 0.477844 sec -INFO: No Floating Point Exceptions have been reported - 2,000,227,335 cycles # 2.879 GHz - 2,887,661,973 instructions # 1.44 insn per cycle - 0.753759254 seconds time elapsed +TOTAL : 0.469407 sec + 2,020,295,671 cycles # 2.810 GHz + 2,851,658,754 instructions # 1.41 insn per cycle + 0.776046944 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272866419447706E-006 -Relative difference = 4.508529302013153e-06 +Avg ME (F77/GPU) = 8.1272867096445498E-006 +Relative difference = 4.516859275763117e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.435869e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.439325e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.439325e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.154994 sec -INFO: No Floating Point Exceptions have been reported - 461,652,768 cycles # 2.918 GHz - 1,376,807,565 instructions # 2.98 insn per cycle - 0.158786297 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2930) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.511421e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.515116e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.515116e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 +TOTAL : 0.151755 sec + 446,437,299 cycles # 2.884 GHz + 1,359,153,558 instructions # 3.04 insn per cycle + 0.155354916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1960) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105271212486E-006 -Relative difference = 5.8180333155894157e-08 +Avg ME (F77/C++) = 8.1278105326147384E-006 +Relative difference = 5.7504445173550794e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.215601e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.220158e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.220158e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.180553e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.185062e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.185062e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.044587 sec -INFO: No Floating Point Exceptions have been reported - 130,364,411 cycles # 2.725 GHz - 367,274,419 instructions # 2.82 insn per cycle - 0.048380365 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:10124) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.045862 sec + 130,422,574 cycles # 2.664 GHz + 366,713,009 instructions # 2.81 insn per cycle + 0.049604747 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9971) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127809e-06 Avg ME (F77/C++) = 8.1278090510674588E-006 Relative difference = 6.2830535070193674e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.799777e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.825160e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.825160e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.692821e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.714744e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.714744e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.020201 sec -INFO: No Floating Point Exceptions have been reported - 63,211,215 cycles # 2.704 GHz - 138,063,768 instructions # 2.18 insn per cycle - 0.023985955 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9196) (512y: 0) (512z: 0) +TOTAL : 0.020805 sec + 63,132,535 cycles # 2.647 GHz + 138,133,867 instructions # 2.19 insn per cycle + 0.024434416 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9272) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 Avg ME (F77/C++) = 8.1275366216540664E-006 Relative difference = 4.655111786058001e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.035669e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.062918e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.062918e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.972359e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.000309e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.000309e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.018625 sec -INFO: No Floating Point Exceptions have been reported - 57,993,332 cycles # 2.658 GHz - 127,990,808 instructions # 2.21 insn per cycle - 0.022353301 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8910) (512y: 28) (512z: 0) +TOTAL : 0.019005 sec + 58,481,038 cycles # 2.633 GHz + 128,386,986 instructions # 2.20 insn per cycle + 0.022679122 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9045) (512y: 8) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 Avg ME (F77/C++) = 8.1275366216540664E-006 Relative difference = 4.655111786058001e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.344103e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.363443e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.363443e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.272413e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.292411e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.292411e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.024010 sec -INFO: No Floating Point Exceptions have been reported - 50,268,269 cycles # 1.840 GHz - 74,785,740 instructions # 1.49 insn per cycle - 0.027917015 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2791) (512y: 30) (512z: 7439) +TOTAL : 0.024623 sec + 50,322,119 cycles # 1.806 GHz + 74,992,557 instructions # 1.49 insn per cycle + 0.028526790 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2983) (512y: 8) (512z: 7425) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 Avg ME (F77/C++) = 8.1275369863475849E-006 Relative difference = 1.6797726498700304e-09 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 50cf2d796e..c43ff17d3c 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:29 +DATE: 2025-10-11_17:03:05 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.754018e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.771557e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.774637e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.763173e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.125938e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.192941e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.469482 sec -INFO: No Floating Point Exceptions have been reported - 1,992,256,665 cycles # 2.872 GHz - 2,888,484,617 instructions # 1.45 insn per cycle - 0.750839241 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.458247 sec + 2,022,321,141 cycles # 2.816 GHz + 2,799,483,258 instructions # 1.38 insn per cycle + 0.774798224 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.962737e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.089994e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.098896e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.755571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.866016e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.873910e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.485052 sec -INFO: No Floating Point Exceptions have been reported - 2,027,704,407 cycles # 2.871 GHz - 3,029,735,278 instructions # 1.49 insn per cycle - 0.765353713 seconds time elapsed +TOTAL : 0.484676 sec + 2,078,557,296 cycles # 2.829 GHz + 2,897,976,393 instructions # 1.39 insn per cycle + 0.794258904 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562879405200E-006 -Relative difference = 3.3369094561706885e-07 +Avg ME (F77/GPU) = 8.1274562122604674E-006 +Relative difference = 3.4300259549904373e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.401289e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.404577e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.404577e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.388630e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.392004e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.392004e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.157429 sec -INFO: No Floating Point Exceptions have been reported - 471,621,611 cycles # 2.936 GHz - 1,398,387,891 instructions # 2.97 insn per cycle - 0.161191989 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3899) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.157940 sec + 464,903,592 cycles # 2.886 GHz + 1,389,803,957 instructions # 2.99 insn per cycle + 0.161593391 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1508) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562948736117E-006 Relative difference = 3.32837900190667e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.729709e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.743939e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.743939e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.572359e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.584503e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.584503e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.080446 sec -INFO: No Floating Point Exceptions have been reported - 237,178,815 cycles # 2.833 GHz - 688,220,781 instructions # 2.90 insn per cycle - 0.084309693 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9334) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.082287 sec + 236,914,725 cycles # 2.777 GHz + 687,861,027 instructions # 2.90 insn per cycle + 0.085920826 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563175290919E-006 Relative difference = 3.3005037703909805e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.409119e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.415451e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.415451e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.419898e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.425632e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.425632e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.039507 sec -INFO: No Floating Point Exceptions have been reported - 114,068,471 cycles # 2.665 GHz - 253,096,543 instructions # 2.22 insn per cycle - 0.043335126 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8363) (512y: 0) (512z: 0) +TOTAL : 0.039368 sec + 113,570,815 cycles # 2.680 GHz + 253,055,756 instructions # 2.23 insn per cycle + 0.042992839 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8121) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.680681e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.688641e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.688641e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.595281e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.602693e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.602693e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.033493 sec -INFO: No Floating Point Exceptions have been reported - 101,334,967 cycles # 2.753 GHz - 233,610,113 instructions # 2.31 insn per cycle - 0.037380618 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7501) (512y: 146) (512z: 0) +TOTAL : 0.035105 sec + 102,173,670 cycles # 2.666 GHz + 233,820,968 instructions # 2.29 insn per cycle + 0.038810282 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7314) (512y: 126) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.194656e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.199944e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.199944e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.158210e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.163544e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.163544e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.046435 sec -INFO: No Floating Point Exceptions have been reported - 91,210,419 cycles # 1.827 GHz - 133,172,431 instructions # 1.46 insn per cycle - 0.050429905 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2084) (512y: 122) (512z: 6354) +TOTAL : 0.047815 sec + 89,915,156 cycles # 1.766 GHz + 131,317,903 instructions # 1.46 insn per cycle + 0.051535880 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1995) (512y: 100) (512z: 6276) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt index e1fc789bed..d6a9bd8585 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:41 +DATE: 2025-10-11_17:03:28 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.793622e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.811451e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.814397e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.669359e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.024328e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.088471e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.470592 sec -INFO: No Floating Point Exceptions have been reported - 1,997,502,547 cycles # 2.880 GHz - 2,923,476,215 instructions # 1.46 insn per cycle - 0.750818094 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.459467 sec + 2,006,632,193 cycles # 2.818 GHz + 2,802,302,686 instructions # 1.40 insn per cycle + 0.769563513 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.055830e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.165646e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.173712e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.797271e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.897088e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.904896e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.491474 sec -INFO: No Floating Point Exceptions have been reported - 2,044,918,526 cycles # 2.859 GHz - 3,006,189,896 instructions # 1.47 insn per cycle - 0.774360899 seconds time elapsed +TOTAL : 0.485964 sec + 2,085,949,128 cycles # 2.828 GHz + 2,970,232,534 instructions # 1.42 insn per cycle + 0.796151358 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562879405200E-006 -Relative difference = 3.3369094561706885e-07 +Avg ME (F77/GPU) = 8.1274562122604674E-006 +Relative difference = 3.4300259549904373e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.402707e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.406541e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.406541e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.393388e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.396682e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.396682e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.156609 sec -INFO: No Floating Point Exceptions have been reported - 468,766,259 cycles # 2.933 GHz - 1,393,706,102 instructions # 2.97 insn per cycle - 0.160398151 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3800) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.156959 sec + 461,726,786 cycles # 2.887 GHz + 1,385,347,614 instructions # 3.00 insn per cycle + 0.160462326 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1502) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562948736117E-006 Relative difference = 3.32837900190667e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.728046e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.740604e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.740604e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.599813e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.612219e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.612219e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.079699 sec -INFO: No Floating Point Exceptions have been reported - 235,148,851 cycles # 2.837 GHz - 684,201,633 instructions # 2.91 insn per cycle - 0.083458032 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9368) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.081200 sec + 234,522,151 cycles # 2.781 GHz + 683,124,885 instructions # 2.91 insn per cycle + 0.084930246 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563175290919E-006 Relative difference = 3.3005037703909805e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.447554e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.453499e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.453499e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.420930e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.426598e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.426598e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.037781 sec -INFO: No Floating Point Exceptions have been reported - 111,660,471 cycles # 2.716 GHz - 248,651,696 instructions # 2.23 insn per cycle - 0.041691428 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8316) (512y: 0) (512z: 0) +TOTAL : 0.038386 sec + 111,202,178 cycles # 2.675 GHz + 248,277,259 instructions # 2.23 insn per cycle + 0.042154353 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8074) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.634149e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.641617e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.641617e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.570276e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.578064e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.578064e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.033571 sec -INFO: No Floating Point Exceptions have been reported - 99,219,938 cycles # 2.697 GHz - 229,292,514 instructions # 2.31 insn per cycle - 0.037291206 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7452) (512y: 146) (512z: 0) +TOTAL : 0.034958 sec + 100,134,440 cycles # 2.632 GHz + 229,125,035 instructions # 2.29 insn per cycle + 0.038647286 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7265) (512y: 126) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.191988e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.196872e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.196872e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.164156e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168925e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.168925e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.045809 sec -INFO: No Floating Point Exceptions have been reported - 88,834,257 cycles # 1.806 GHz - 128,615,199 instructions # 1.45 insn per cycle - 0.049747357 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2035) (512y: 122) (512z: 6355) +TOTAL : 0.046899 sec + 87,248,248 cycles # 1.750 GHz + 126,582,829 instructions # 1.45 insn per cycle + 0.050568011 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1946) (512y: 100) (512z: 6276) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index 107a77153b..0619b08e27 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:58:55 +DATE: 2025-10-11_17:00:50 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.910300e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.325267e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.783205e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.353699e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.078498e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.922999e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.523085 sec -INFO: No Floating Point Exceptions have been reported - 2,188,593,202 cycles # 2.883 GHz - 3,112,954,096 instructions # 1.42 insn per cycle - 0.817031478 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 130 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.530539 sec + 2,259,281,332 cycles # 2.839 GHz + 3,100,637,501 instructions # 1.37 insn per cycle + 0.855479528 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 124 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 -Avg ME (F77/GPU) = 0.14771956172964262 -Relative difference = 2.590743366698123e-07 +Avg ME (F77/GPU) = 0.14771956172964260 +Relative difference = 2.5907433685770594e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.066686e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.035589e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.035589e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.156775e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.205296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.205296e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.278019 sec -INFO: No Floating Point Exceptions have been reported - 3,764,987,469 cycles # 2.931 GHz - 9,752,169,319 instructions # 2.59 insn per cycle - 1.285199771 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 341) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.400705 sec + 4,031,222,897 cycles # 2.869 GHz + 9,715,380,409 instructions # 2.41 insn per cycle + 1.406286157 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 406) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.478889e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.890818e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.890818e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.450099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.861491e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.861491e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.830635 sec -INFO: No Floating Point Exceptions have been reported - 2,356,582,684 cycles # 2.814 GHz - 5,959,230,788 instructions # 2.53 insn per cycle - 0.838030934 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1369) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.838337 sec + 2,350,240,123 cycles # 2.786 GHz + 5,962,397,870 instructions # 2.54 insn per cycle + 0.844193677 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1351) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.229956e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.271002e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.271002e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.162719e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.161528e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.161528e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.594206 sec -INFO: No Floating Point Exceptions have been reported - 1,695,017,656 cycles # 2.820 GHz - 3,345,002,918 instructions # 1.97 insn per cycle - 0.601755215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1499) (512y: 0) (512z: 0) +TOTAL : 0.600854 sec + 1,671,713,001 cycles # 2.758 GHz + 3,319,973,297 instructions # 1.99 insn per cycle + 0.606663801 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1492) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.272289e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.349942e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.349942e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.261662e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.349890e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.349890e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.586413 sec -INFO: No Floating Point Exceptions have been reported - 1,670,913,790 cycles # 2.815 GHz - 3,318,759,581 instructions # 1.99 insn per cycle - 0.594196558 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1375) (512y: 96) (512z: 0) +TOTAL : 0.577948 sec + 1,617,041,581 cycles # 2.773 GHz + 3,291,143,565 instructions # 2.04 insn per cycle + 0.583833732 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1367) (512y: 96) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.146635e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.068698e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.068698e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.100149e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.993172e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.993172e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.618076 sec -INFO: No Floating Point Exceptions have been reported - 1,426,424,228 cycles # 2.279 GHz - 2,470,718,173 instructions # 1.73 insn per cycle - 0.626622796 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 580) (512y: 60) (512z: 1021) +TOTAL : 0.615039 sec + 1,364,172,223 cycles # 2.200 GHz + 2,429,556,714 instructions # 1.78 insn per cycle + 0.620861975 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 583) (512y: 60) (512z: 1009) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt index 00276091a3..071e7697d0 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:08 +DATE: 2025-10-11_17:01:05 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.969963e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.427733e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.936447e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.417263e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.094810e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.959655e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.519852 sec -INFO: No Floating Point Exceptions have been reported - 2,172,307,830 cycles # 2.872 GHz - 3,081,950,905 instructions # 1.42 insn per cycle - 0.813507263 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.525108 sec + 2,234,624,938 cycles # 2.820 GHz + 3,124,481,460 instructions # 1.40 insn per cycle + 0.850037014 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 122 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 -Avg ME (F77/GPU) = 0.14771956172964262 -Relative difference = 2.590743366698123e-07 +Avg ME (F77/GPU) = 0.14771956172964260 +Relative difference = 2.5907433685770594e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.156288e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.045734e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.045734e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.289834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.373214e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.373214e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.265578 sec -INFO: No Floating Point Exceptions have been reported - 3,747,828,201 cycles # 2.946 GHz - 9,632,221,913 instructions # 2.57 insn per cycle - 1.272810702 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 359) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.378734 sec + 3,995,674,296 cycles # 2.888 GHz + 9,595,338,306 instructions # 2.40 insn per cycle + 1.384441945 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 401) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.494739e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.931280e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.931280e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.457938e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.874008e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.874008e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.827695 sec -INFO: No Floating Point Exceptions have been reported - 2,378,817,913 cycles # 2.850 GHz - 5,912,991,474 instructions # 2.49 insn per cycle - 0.835517705 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1340) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.834586 sec + 2,348,281,075 cycles # 2.796 GHz + 5,903,694,010 instructions # 2.51 insn per cycle + 0.840556806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1329) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.079942e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.957305e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.957305e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.178686e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.194593e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.194593e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.628333 sec -INFO: No Floating Point Exceptions have been reported - 1,788,933,654 cycles # 2.817 GHz - 3,328,376,953 instructions # 1.86 insn per cycle - 0.635862534 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1436) (512y: 0) (512z: 0) +TOTAL : 0.595816 sec + 1,665,750,464 cycles # 2.772 GHz + 3,289,499,758 instructions # 1.97 insn per cycle + 0.601728408 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1437) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.320640e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.437091e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.437091e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.254319e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.335615e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.335615e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.574324 sec -INFO: No Floating Point Exceptions have been reported - 1,653,934,067 cycles # 2.845 GHz - 3,291,054,827 instructions # 1.99 insn per cycle - 0.581926884 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1328) (512y: 96) (512z: 0) +TOTAL : 0.579487 sec + 1,624,326,903 cycles # 2.777 GHz + 3,265,891,511 instructions # 2.01 insn per cycle + 0.585419257 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1330) (512y: 96) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.152026e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087565e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.087565e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.069886e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.953317e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.953317e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.611501 sec -INFO: No Floating Point Exceptions have been reported - 1,420,414,146 cycles # 2.296 GHz - 2,439,626,449 instructions # 1.72 insn per cycle - 0.619276325 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 547) (512y: 60) (512z: 1007) +TOTAL : 0.621553 sec + 1,373,190,892 cycles # 2.193 GHz + 2,413,828,053 instructions # 1.76 insn per cycle + 0.627336488 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 550) (512y: 60) (512z: 1005) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index bd2093b69b..6216dff6c8 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:44 +DATE: 2025-10-11_17:01:47 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.032605e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087100e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.501992e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.174946e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.068173e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.272719e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.487961 sec -INFO: No Floating Point Exceptions have been reported - 2,048,884,733 cycles # 2.866 GHz - 2,915,076,407 instructions # 1.42 insn per cycle - 0.773529382 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 97 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.489126 sec + 2,124,007,963 cycles # 2.815 GHz + 2,945,321,471 instructions # 1.39 insn per cycle + 0.811539193 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 83 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477195e-01 -Avg ME (F77/GPU) = 0.14771956735057756 -Relative difference = 4.559355911674916e-07 +Avg ME (F77/GPU) = 0.14771956769982353 +Relative difference = 4.58299842099026e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.070270e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.039772e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.039772e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.779077e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.006315e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.006315e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.246364 sec -INFO: No Floating Point Exceptions have been reported - 3,688,263,957 cycles # 2.948 GHz - 9,604,598,454 instructions # 2.60 insn per cycle - 1.251819600 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.286813 sec + 3,697,266,650 cycles # 2.863 GHz + 9,611,683,530 instructions # 2.60 insn per cycle + 1.292373810 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe 
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956094773486 Relative difference = 2.643675256627469e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.214709e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.338045e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.338045e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.204438e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.350250e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.350250e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.563106 sec -INFO: No Floating Point Exceptions have been reported - 1,636,975,072 cycles # 2.881 GHz - 3,967,404,939 instructions # 2.42 insn per cycle - 0.568812477 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1579) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.567715 sec + 1,640,656,743 cycles # 2.864 GHz + 3,979,080,194 instructions # 2.43 insn per cycle + 0.573454265 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1553) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955861942843 Relative difference = 2.80129187869649e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.994371e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.295152e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.295152e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.953501e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.188885e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.188885e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.439594 sec -INFO: No Floating Point Exceptions have been reported - 1,256,321,725 cycles # 2.826 GHz - 2,497,438,777 instructions # 1.99 insn per cycle - 0.445252542 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1924) (512y: 0) (512z: 0) +TOTAL : 0.446090 sec + 1,257,376,904 cycles # 2.787 GHz + 2,504,409,181 instructions # 1.99 insn per cycle + 0.451851006 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1915) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955698961392 Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.098864e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.632832e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.632832e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.026066e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.404220e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.404220e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.427898 sec -INFO: No Floating Point Exceptions have been reported - 1,236,536,318 cycles # 2.855 GHz - 2,473,365,360 instructions # 2.00 insn per cycle - 0.433705293 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1870) (512y: 1) (512z: 0) +TOTAL : 0.438014 sec + 1,235,323,979 cycles # 2.788 GHz + 2,479,535,477 instructions # 2.01 insn per cycle + 0.443692621 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1861) (512y: 1) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955698961392 Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.931142e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.994223e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.994223e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.854396e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.809242e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.809242e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0 -TOTAL : 0.448530 sec -INFO: No Floating Point Exceptions have been reported - 1,079,279,667 cycles # 2.379 GHz - 2,073,684,661 instructions # 1.92 insn per cycle - 0.454351959 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1011) (512y: 5) (512z: 1292) +TOTAL : 0.460001 sec + 1,078,883,681 cycles # 2.321 GHz + 2,076,270,716 instructions # 1.92 insn per cycle + 0.465628674 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1014) (512y: 5) (512z: 1276) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955262403935 Relative difference = 3.207154680524219e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt index 2473496911..b9e5df5750 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:56 +DATE: 2025-10-11_17:02:06 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.032625e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.129649e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.575777e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.174766e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.032980e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.224739e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.481858 sec -INFO: No Floating Point Exceptions have been reported - 2,051,512,664 cycles # 2.885 GHz - 2,948,723,179 instructions # 1.44 insn per cycle - 0.768027645 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 86 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.489051 sec + 2,148,781,052 cycles # 2.834 GHz + 2,942,650,451 instructions # 1.37 insn per cycle + 0.815858067 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 83 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477195e-01 -Avg ME (F77/GPU) = 0.14771956525510177 -Relative difference = 4.4175008557828484e-07 +Avg ME (F77/GPU) = 0.14771956508047879 +Relative difference = 4.4056796011251757e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.212337e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.061006e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.061006e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.862221e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.017701e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.017701e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.227579 sec -INFO: No Floating Point Exceptions have been reported - 3,620,291,769 cycles # 2.937 GHz - 9,471,544,557 instructions # 2.62 insn per cycle - 1.233302650 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 367) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.273068 sec + 3,660,086,626 cycles # 2.864 GHz + 9,502,319,452 instructions # 2.60 insn per cycle + 1.278709233 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 370) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956094773486 Relative difference = 2.643675256627469e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.220343e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.350531e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.350531e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.092947e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.109735e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.109735e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.560958 sec -INFO: No Floating Point Exceptions have been reported - 1,637,220,191 cycles # 2.892 GHz - 3,933,324,289 instructions # 2.40 insn per cycle - 0.566799529 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1517) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.591777 sec + 1,671,501,463 cycles # 2.802 GHz + 3,947,247,316 instructions # 2.36 insn per cycle + 0.597353565 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1510) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955861942843 Relative difference = 2.80129187869649e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.995950e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312007e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.312007e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.904335e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.013564e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.013564e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.438140 sec -INFO: No Floating Point Exceptions have been reported - 1,255,613,659 cycles # 2.833 GHz - 2,482,092,959 instructions # 1.98 insn per cycle - 0.443764126 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1817) (512y: 0) (512z: 0) +TOTAL : 0.451671 sec + 1,251,161,997 cycles # 2.741 GHz + 2,488,699,975 instructions # 1.99 insn per cycle + 0.457155054 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1819) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955698961392 Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.087645e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.599722e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.599722e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.993855e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.299058e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.299058e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.428178 sec -INFO: No Floating Point Exceptions have been reported - 1,231,320,501 cycles # 2.843 GHz - 2,457,271,461 instructions # 2.00 insn per cycle - 0.433769891 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1773) (512y: 1) (512z: 0) +TOTAL : 0.440947 sec + 1,225,739,794 cycles # 2.746 GHz + 2,464,639,586 instructions # 2.01 insn per cycle + 0.448602225 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1777) (512y: 1) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955698961392 Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.945345e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.024652e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.024652e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.880064e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.891083e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.891083e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0 -TOTAL : 0.444653 sec -INFO: No Floating Point Exceptions have been reported - 1,073,447,692 cycles # 2.387 GHz - 2,057,517,401 instructions # 1.92 insn per cycle - 0.450271011 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 906) (512y: 5) (512z: 1273) +TOTAL : 0.454521 sec + 1,073,931,359 cycles # 2.337 GHz + 2,059,749,623 instructions # 1.92 insn per cycle + 0.460150581 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 909) (512y: 5) (512z: 1267) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955262403935 Relative difference = 3.207154680524219e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 5ae4907c26..5e30b14ca9 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:20 +DATE: 2025-10-11_17:01:19 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.059495e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.307970e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.770458e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.446721e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.093075e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.939789e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.521991 sec -INFO: No Floating Point Exceptions have been reported - 2,182,804,723 cycles # 2.882 GHz - 3,091,712,352 instructions # 1.42 insn per cycle - 0.814546737 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 130 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.525703 sec + 2,236,736,054 cycles # 2.823 GHz + 3,119,267,572 instructions # 1.39 insn per cycle + 0.849597854 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 124 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 -Avg ME (F77/GPU) = 0.14771956187351573 -Relative difference = 2.5810037581511336e-07 +Avg ME (F77/GPU) = 0.14771956605979195 +Relative difference = 2.2976103415315142e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.006175e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.025890e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.025890e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.117543e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.151188e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.151188e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.286472 sec -INFO: No Floating Point Exceptions have been reported - 3,808,533,169 cycles # 2.945 GHz - 9,779,238,528 instructions # 2.57 insn per cycle - 1.294044616 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 341) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.406267 sec + 4,043,925,432 cycles # 2.865 GHz + 9,738,556,635 instructions # 2.41 insn per cycle + 1.412149316 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 406) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956645541506 Relative difference = 2.270828308707201e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.477969e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.892042e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.892042e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.480932e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.914447e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914447e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.834785 sec -INFO: No Floating Point Exceptions have been reported - 2,360,159,801 cycles # 2.803 GHz - 5,954,715,990 instructions # 2.52 insn per cycle - 0.842708021 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1412) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.824504 sec + 2,316,933,637 cycles # 2.792 GHz + 5,851,816,983 instructions # 2.53 insn per cycle + 0.830593669 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1366) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956645541506 Relative difference = 2.270828308707201e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.260391e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.350498e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.350498e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.246053e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.337007e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.337007e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.587860 sec -INFO: No Floating Point Exceptions have been reported - 1,670,861,769 cycles # 2.810 GHz - 3,283,918,691 instructions # 1.97 insn per cycle - 0.595426943 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1567) (512y: 0) (512z: 0) +TOTAL : 0.582389 sec + 1,613,472,858 cycles # 2.745 GHz + 3,206,778,468 instructions # 1.99 insn per cycle + 0.588460320 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1531) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956674392650 Relative difference = 2.2512972893324335e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.348300e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.498815e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.498815e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.322435e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.481610e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.481610e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.573278 sec -INFO: No Floating Point Exceptions have been reported - 1,645,784,221 cycles # 2.835 GHz - 3,247,832,958 instructions # 1.97 insn per cycle - 0.581347619 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1446) (512y: 101) (512z: 0) +TOTAL : 0.567372 sec + 1,569,665,304 cycles # 2.742 GHz + 3,175,442,225 instructions # 2.02 insn per cycle + 0.573184846 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1435) (512y: 101) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe [ 
PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956674392650 Relative difference = 2.2512972893324335e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.143317e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.068862e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.068862e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.075660e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.951397e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951397e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.613179 sec -INFO: No Floating Point Exceptions have been reported - 1,394,199,360 cycles # 2.248 GHz - 2,406,597,613 instructions # 1.73 insn per cycle - 0.620673412 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 768) (512y: 64) (512z: 1063) +TOTAL : 0.621447 sec + 1,359,798,497 cycles # 2.170 GHz + 2,353,126,759 instructions # 1.73 insn per cycle + 0.627307566 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 738) (512y: 64) (512z: 1042) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956674392650 Relative difference = 2.2512972893324335e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt index 3e507cd882..3f206f95bd 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:32 +DATE: 2025-10-11_17:01:33 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.080757e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.449829e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.987143e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.462369e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.119008e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.948835e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.520335 sec -INFO: No Floating Point Exceptions have been reported - 2,182,231,478 cycles # 2.885 GHz - 3,097,447,003 instructions # 1.42 insn per cycle - 0.813407395 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.522593 sec + 2,229,764,062 cycles # 2.824 GHz + 3,122,707,099 instructions # 1.40 insn per cycle + 0.846718941 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 122 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 -Avg ME (F77/GPU) = 0.14771956187351573 -Relative difference = 2.5810037581511336e-07 +Avg ME (F77/GPU) = 0.14771956605979195 +Relative difference = 2.2976103415315142e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.967180e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.023779e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.023779e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.222292e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.282147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.282147e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.289771 sec -INFO: No Floating Point Exceptions have been reported - 3,794,201,935 cycles # 2.927 GHz - 9,666,542,351 instructions # 2.55 insn per cycle - 1.297077628 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 359) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.390029 sec + 4,041,827,914 cycles # 2.897 GHz + 9,620,480,831 instructions # 2.38 insn per cycle + 1.395839351 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 401) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956645541506 Relative difference = 2.270828308707201e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.583493e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.064503e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.064503e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.484588e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.916467e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.916467e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.784715 sec -INFO: No Floating Point Exceptions have been reported - 2,328,374,642 cycles # 2.942 GHz - 5,878,440,022 instructions # 2.52 insn per cycle - 0.792155161 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1371) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.821088 sec + 2,277,892,232 cycles # 2.757 GHz + 5,806,859,822 instructions # 2.55 insn per cycle + 0.826926685 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1349) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956645541506 Relative difference = 2.270828308707201e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.254464e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.329047e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.329047e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.285308e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.418349e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.418349e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.590226 sec -INFO: No Floating Point Exceptions have been reported - 1,689,754,472 cycles # 2.827 GHz - 3,255,343,739 instructions # 1.93 insn per cycle - 0.598325338 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1483) (512y: 0) (512z: 0) +TOTAL : 0.573049 sec + 1,611,028,972 cycles # 2.786 GHz + 3,186,162,266 instructions # 1.98 insn per cycle + 0.579129244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1474) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956674392650 Relative difference = 2.2512972893324335e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.345727e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.502859e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.502859e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.356503e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.544553e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.544553e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.572126 sec -INFO: No Floating Point Exceptions have been reported - 1,634,040,486 cycles # 2.820 GHz - 3,219,951,921 instructions # 1.97 insn per cycle - 0.580193189 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1382) (512y: 101) (512z: 0) +TOTAL : 0.558398 sec + 1,559,160,941 cycles # 2.767 GHz + 3,150,562,622 instructions # 2.02 insn per cycle + 0.564070384 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1373) (512y: 101) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe [ 
PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956674392650 Relative difference = 2.2512972893324335e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.168828e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.118471e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.118471e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.173215e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.148914e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.148914e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.609357 sec -INFO: No Floating Point Exceptions have been reported - 1,417,478,840 cycles # 2.299 GHz - 2,399,490,515 instructions # 1.69 insn per cycle - 0.617376810 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 716) (512y: 64) (512z: 1056) +TOTAL : 0.596537 sec + 1,348,900,555 cycles # 2.242 GHz + 2,335,239,112 instructions # 1.73 insn per cycle + 0.602236132 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 687) (512y: 64) (512z: 1030) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956674392650 Relative difference = 2.2512972893324335e-07 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index 607647c622..e3ea0d9299 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:56:33 +DATE: 2025-10-11_16:57:54 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.270000e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.214418e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.893995e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.706908e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.160258e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.561103e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.542175 sec -INFO: No Floating Point Exceptions have been reported - 2,178,993,269 cycles # 2.803 GHz - 3,108,059,533 instructions # 1.43 insn per cycle - 0.838052893 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.544889 sec + 2,278,331,746 cycles # 2.802 GHz + 3,194,429,442 instructions # 1.40 insn per cycle + 0.872956184 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358666195562 -Relative difference = 6.616631711254798e-08 +Avg ME (F77/GPU) = 2.0158358666195557 +Relative difference = 6.616631733284825e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.830273e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.876984e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.876984e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.781718e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.827404e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.827404e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.836443 sec -INFO: No Floating Point Exceptions have been reported - 17,247,101,824 cycles # 2.952 GHz - 45,921,478,129 instructions # 2.66 insn per cycle - 5.842453521 seconds time elapsed +TOTAL : 5.994100 sec + 17,282,311,221 cycles # 2.881 GHz + 46,327,593,495 instructions # 2.68 insn per cycle + 5.999488168 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194407 Relative difference = 6.616637439061751e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.179372e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.338539e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.338539e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.117362e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.271065e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.271065e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.409251 sec -INFO: No Floating Point Exceptions have been reported - 10,038,815,546 cycles # 2.940 GHz - 27,809,165,185 instructions # 2.77 insn per cycle - 3.415697404 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2537) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.473625 sec + 10,058,480,748 cycles # 2.892 GHz + 27,928,334,913 instructions # 2.78 insn per cycle + 3.479625370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2526) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = 
( 5.016017e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.397611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.397611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.891803e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.272223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.272223e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.202025 sec -INFO: No Floating Point Exceptions have been reported - 6,083,216,423 cycles # 2.757 GHz - 12,595,496,799 instructions # 2.07 insn per cycle - 2.208459235 seconds time elapsed +TOTAL : 2.253673 sec + 6,113,479,898 cycles # 2.707 GHz + 12,619,681,498 instructions # 2.06 insn per cycle + 2.259543422 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2620) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.491994e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.947919e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.947919e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.064851e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.470121e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
5.470121e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.018419 sec -INFO: No Floating Point Exceptions have been reported - 5,588,215,007 cycles # 2.761 GHz - 12,004,808,489 instructions # 2.15 insn per cycle - 2.024606102 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2365) (512y: 144) (512z: 0) +TOTAL : 2.179283 sec + 5,867,669,279 cycles # 2.687 GHz + 12,194,655,166 instructions # 2.08 insn per cycle + 2.184803472 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2417) (512y: 124) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.529303e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.713663e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.713663e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.394256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.568035e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.568035e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.082414 sec -INFO: No Floating Point Exceptions have been reported - 5,763,724,377 cycles # 1.867 
GHz - 8,350,228,242 instructions # 1.45 insn per cycle - 3.088879573 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1468) (512y: 122) (512z: 1806) +TOTAL : 3.199079 sec + 5,758,256,477 cycles # 1.797 GHz + 8,312,435,809 instructions # 1.44 insn per cycle + 3.204885362 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1464) (512y: 100) (512z: 1805) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt index 3ed4c3c5ff..85796cb2e8 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:56:58 +DATE: 2025-10-11_16:58:23 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.306886e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.297289e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.977845e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.750318e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.090521e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.471741e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.530204 sec -INFO: No Floating Point Exceptions have been reported - 2,211,323,980 cycles # 2.884 GHz - 3,201,430,578 instructions # 1.45 insn per cycle - 0.823926524 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.536193 sec + 2,280,468,803 cycles # 2.831 GHz + 3,171,048,990 instructions # 1.39 insn per cycle + 0.862856350 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358666195562 -Relative difference = 6.616631711254798e-08 +Avg ME (F77/GPU) = 2.0158358666195557 +Relative difference = 6.616631733284825e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.872475e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.921622e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.921622e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.830968e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.879197e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.879197e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.706755 sec -INFO: No Floating Point Exceptions have been reported - 16,797,600,798 cycles # 2.941 GHz - 44,912,592,336 instructions # 2.67 insn per cycle - 5.712473159 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 566) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.834979 sec + 16,842,100,019 cycles # 2.884 GHz + 45,296,854,647 instructions # 2.69 insn per cycle + 5.840673910 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions 
will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.376254e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.552215e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.552215e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.286582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.457425e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.457425e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.215396 sec -INFO: No Floating Point Exceptions have been reported - 9,523,990,060 cycles # 2.957 GHz - 26,686,144,259 instructions # 2.80 insn per cycle - 3.221864250 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2326) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.299071 sec + 9,574,991,301 cycles # 2.898 GHz + 26,751,055,486 instructions # 2.79 insn per cycle + 3.304842345 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.628485e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.953785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.953785e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.483668e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.795787e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.795787e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.376456 sec -INFO: No Floating Point Exceptions have been reported - 6,603,885,103 cycles # 2.772 GHz - 14,117,515,687 instructions # 2.14 insn per cycle - 2.382952116 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2705) (512y: 0) (512z: 0) +TOTAL : 2.446633 sec + 6,630,126,092 cycles # 2.705 GHz + 14,155,939,252 instructions # 2.14 insn per cycle + 2.452232412 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2708) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.799064e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.148539e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.148539e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.633646e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.966509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.966509e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.297050 sec -INFO: No Floating Point Exceptions have been reported - 6,386,723,525 cycles # 2.773 GHz - 13,726,619,432 instructions # 2.15 insn per cycle - 2.304339219 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2356) (512y: 298) (512z: 0) +TOTAL : 2.371147 sec + 6,420,781,885 cycles # 2.703 GHz + 13,756,522,591 instructions # 2.14 insn per cycle + 2.376767940 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2358) (512y: 297) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.339110e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504311e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.504311e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.247851e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.404590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.404590e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.254444 sec -INFO: No Floating Point Exceptions have been reported - 5,974,020,045 cycles # 1.833 GHz - 10,122,964,274 instructions # 1.69 insn per cycle - 3.261538649 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1318) (512y: 208) (512z: 1986) +TOTAL : 3.336819 sec + 5,939,444,089 cycles # 1.778 GHz + 10,130,416,003 instructions # 1.71 insn per cycle + 3.342426568 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1321) (512y: 208) (512z: 1987) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 7bd4c9bca6..e92931017f 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:58:13 +DATE: 2025-10-11_16:59:57 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.178914e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.740854e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.866078e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.265470e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.796248e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925275e+08 ) sec^-1 MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.492488 sec -INFO: No Floating Point Exceptions have been reported - 2,067,407,730 cycles # 2.879 GHz - 2,921,575,837 instructions # 1.41 insn per cycle - 0.777094459 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 125 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.494715 sec + 2,133,928,532 cycles # 2.829 GHz + 2,961,237,291 instructions # 1.39 insn per cycle + 0.812186327 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 97 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015841e+00 -Avg ME (F77/GPU) = 2.0158787037944421 -Relative difference = 1.870375413642407e-05 +Avg ME (F77/GPU) = 2.0158787077525631 +Relative difference = 1.870571764492604e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.933137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.988210e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.988210e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.878391e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.930853e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.930853e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.510560 sec -INFO: No Floating Point Exceptions have been reported - 16,216,363,781 cycles # 2.940 GHz - 45,321,064,348 instructions # 2.79 insn per cycle - 5.516237540 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.670408 sec + 16,367,724,454 cycles # 2.885 GHz + 45,532,008,663 instructions # 2.78 insn per cycle + 5.675967017 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 Avg ME (F77/C++) = 2.0158491701586172 Relative difference = 8.441039850630506e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] 
[inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.554782e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.893509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.893509e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.407671e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.731067e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.731067e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.389253 sec -INFO: No Floating Point Exceptions have been reported - 7,056,712,623 cycles # 2.947 GHz - 17,792,064,584 instructions # 2.52 insn per cycle - 2.395009745 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3147) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.467869 sec + 7,095,747,201 cycles # 2.870 GHz + 17,858,347,842 instructions # 2.52 insn per cycle + 2.473312825 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3126) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 Avg ME (F77/C++) = 2.0158486895961687 Relative difference = 1.539816876576819e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.351394e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.496890e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.496890e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.089358e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.160867e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.160867e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.343765 sec -INFO: No Floating Point Exceptions have been reported - 3,745,450,403 cycles # 2.777 GHz - 8,262,540,860 instructions # 2.21 insn per cycle - 1.349671424 seconds time elapsed +TOTAL : 1.384690 sec + 3,760,865,125 cycles # 2.707 GHz + 8,296,401,814 instructions # 2.21 insn per cycle + 1.390188663 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3371) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 Avg ME (F77/C++) = 2.0158474864438176 Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.821818e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.011140e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.011140e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.420631e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
9.588852e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.588852e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.275053 sec -INFO: No Floating Point Exceptions have been reported - 3,558,622,083 cycles # 2.780 GHz - 7,915,407,710 instructions # 2.22 insn per cycle - 1.280856743 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3214) (512y: 20) (512z: 0) +TOTAL : 1.334053 sec + 3,653,512,814 cycles # 2.729 GHz + 8,025,167,005 instructions # 2.20 insn per cycle + 1.339479555 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3272) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 Avg ME (F77/C++) = 2.0158474864438176 Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.584138e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.256759e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.256759e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.300716e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.921877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.921877e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.679646 sec -INFO: No Floating Point Exceptions 
have been reported - 3,255,689,642 cycles # 1.933 GHz - 6,101,216,288 instructions # 1.87 insn per cycle - 1.685383243 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2258) (512y: 22) (512z: 2156) +TOTAL : 1.752788 sec + 3,290,640,509 cycles # 1.873 GHz + 6,097,403,848 instructions # 1.85 insn per cycle + 1.758187036 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2262) (512y: 0) (512z: 2152) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015848e+00 Avg ME (F77/C++) = 2.0158476348733529 Relative difference = 1.8112806478434436e-07 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt index bd2def4f48..890303a8f4 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:58:33 +DATE: 2025-10-11_17:00:25 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.136229e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.747823e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.880709e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.221580e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.787567e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.918978e+08 ) sec^-1 MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.488528 sec -INFO: No Floating Point Exceptions have been reported - 2,057,813,122 cycles # 2.874 GHz - 2,903,563,490 instructions # 1.41 insn per cycle - 0.774040886 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.494192 sec + 2,133,895,255 cycles # 2.826 GHz + 2,984,971,388 instructions # 1.40 insn per cycle + 0.812316425 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015841e+00 -Avg ME (F77/GPU) = 2.0158787037944421 -Relative difference = 1.870375413642407e-05 +Avg ME (F77/GPU) = 2.0158787077525631 +Relative difference = 1.870571764492604e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.970300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.026987e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.026987e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.920936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.975706e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.975706e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.407589 sec -INFO: No Floating Point Exceptions have been reported - 15,991,185,925 cycles # 2.955 GHz - 44,429,993,623 instructions # 2.78 insn per cycle - 5.412895968 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 533) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.545042 sec + 16,055,557,680 cycles # 2.893 GHz + 44,606,147,249 instructions # 2.78 insn per cycle + 5.550363279 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 534) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions 
will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 Avg ME (F77/C++) = 2.0158491701586172 Relative difference = 8.441039850630506e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.328908e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.798682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.798682e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.166744e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.616602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.616602e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.053409 sec -INFO: No Floating Point Exceptions have been reported - 6,061,427,520 cycles # 2.945 GHz - 17,076,312,832 instructions # 2.82 insn per cycle - 2.059026016 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2862) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.117207 sec + 6,107,535,010 cycles # 2.878 GHz + 17,151,265,141 instructions # 2.81 insn per cycle + 2.122735579 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 Avg ME (F77/C++) = 2.0158486895961687 Relative difference = 1.539816876576819e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.019252e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.594125e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.594125e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.890362e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.440713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.440713e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.827330 sec -INFO: No Floating Point Exceptions have been reported - 5,036,041,688 cycles # 2.749 GHz - 10,223,391,747 instructions # 2.03 insn per cycle - 1.833165934 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3906) (512y: 0) (512z: 0) +TOTAL : 1.868040 sec + 5,037,008,594 cycles # 2.691 GHz + 10,256,105,804 instructions # 2.04 insn per cycle + 1.873591030 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3910) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 Avg ME (F77/C++) = 2.0158474864438176 Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.156943e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.756865e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.756865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.987209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.558432e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.558432e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.789449 sec -INFO: No Floating Point Exceptions have been reported - 4,972,642,094 cycles # 2.772 GHz - 9,995,367,434 instructions # 2.01 insn per cycle - 1.795052964 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3805) (512y: 2) (512z: 0) +TOTAL : 1.838312 sec + 4,976,298,083 cycles # 2.700 GHz + 10,027,200,665 instructions # 2.01 insn per cycle + 1.843999254 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3807) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 Avg ME (F77/C++) = 2.0158474864438176 Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.670992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.000057e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.000057e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.543540e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.857388e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.857388e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 2.331763 sec -INFO: No Floating Point Exceptions have been reported - 4,369,500,962 cycles # 1.870 GHz - 8,444,287,674 instructions # 1.93 insn per cycle - 2.337616992 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2744) (512y: 4) (512z: 2754) +TOTAL : 2.395195 sec + 4,386,171,031 cycles # 1.828 GHz + 8,457,161,359 instructions # 1.93 insn per cycle + 2.400661750 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2747) (512y: 4) (512z: 2749) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015848e+00 Avg ME (F77/C++) = 2.0158476348733529 Relative difference = 1.8112806478434436e-07 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 9029ad668b..2e4f76055c 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:57:23 +DATE: 2025-10-11_16:58:53 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.278122e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.299718e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.972605e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.803206e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.197061e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.595248e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.535533 sec -INFO: No Floating Point Exceptions have been reported - 2,218,013,615 cycles # 2.871 GHz - 3,167,587,965 instructions # 1.43 insn per cycle - 0.830721869 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.542499 sec + 2,291,067,565 cycles # 2.822 GHz + 3,214,215,859 instructions # 1.40 insn per cycle + 0.903410898 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358639104246 -Relative difference = 6.751024171044779e-08 +Avg ME (F77/GPU) = 2.0158359218521276 +Relative difference = 3.876697936613229e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.807535e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.852925e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.852925e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.773351e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.818033e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.818033e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.910224 sec -INFO: No Floating Point Exceptions have been reported - 17,388,420,068 cycles # 2.940 GHz - 46,077,588,135 instructions # 2.65 insn per cycle - 5.916245730 seconds time elapsed +TOTAL : 6.022953 sec + 17,468,685,186 cycles # 2.898 GHz + 46,428,017,151 instructions # 2.66 insn per cycle + 6.028694923 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.226882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.387878e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.387878e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.098858e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.251324e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.251324e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.360909 sec -INFO: No Floating Point Exceptions have been reported - 9,940,043,952 cycles # 2.953 GHz - 27,598,360,403 instructions # 2.78 insn per cycle - 3.367569953 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2581) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.494063 sec + 10,018,252,515 cycles # 2.863 GHz + 27,545,325,597 instructions # 2.75 insn per cycle + 3.499809973 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
5.038546e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.426797e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.426797e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.882400e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.252051e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.252051e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.194996 sec -INFO: No Floating Point Exceptions have been reported - 6,084,814,623 cycles # 2.765 GHz - 12,511,133,896 instructions # 2.06 insn per cycle - 2.201688699 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2776) (512y: 0) (512z: 0) +TOTAL : 2.257811 sec + 5,988,198,927 cycles # 2.647 GHz + 12,439,095,003 instructions # 2.08 insn per cycle + 2.263664182 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2756) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.589922e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.068248e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.068248e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.259591e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 5.697101e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.697101e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 1.988387 sec -INFO: No Floating Point Exceptions have been reported - 5,540,380,764 cycles # 2.778 GHz - 11,938,541,192 instructions # 2.15 insn per cycle - 1.995322896 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2521) (512y: 146) (512z: 0) +TOTAL : 2.102985 sec + 5,735,490,837 cycles # 2.721 GHz + 12,004,650,662 instructions # 2.09 insn per cycle + 2.108573871 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2556) (512y: 126) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.615006e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.807457e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.807457e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.518029e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.702687e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.702687e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.015683 sec 
-INFO: No Floating Point Exceptions have been reported - 5,630,115,254 cycles # 1.863 GHz - 8,130,918,173 instructions # 1.44 insn per cycle - 3.022730001 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1671) (512y: 126) (512z: 1865) +TOTAL : 3.089670 sec + 5,573,654,696 cycles # 1.801 GHz + 7,983,962,804 instructions # 1.43 insn per cycle + 3.095529304 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1826) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt index 44aa1a6a94..09594959d7 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,210 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:57:48 +DATE: 2025-10-11_16:59:25 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.308177e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.314026e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.965515e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.800950e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.127229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.485215e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.530653 sec -INFO: No Floating Point Exceptions have been reported - 2,220,013,015 cycles # 2.891 GHz - 3,185,773,009 instructions # 1.44 insn per cycle - 0.824701846 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.537601 sec + 2,294,644,932 cycles # 2.834 GHz + 3,202,661,173 instructions # 1.40 insn per cycle + 0.866738405 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358639104246 -Relative difference = 6.751024171044779e-08 +Avg ME (F77/GPU) = 2.0158359218521276 +Relative difference = 3.876697936613229e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.857128e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.905464e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.905464e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.809865e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.856790e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.856790e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.753526 sec -INFO: No Floating Point Exceptions have been reported - 16,958,834,547 cycles # 2.945 GHz - 45,095,701,979 instructions # 2.66 insn per cycle - 5.759360611 seconds time elapsed +TOTAL : 5.902916 sec + 17,031,724,118 cycles # 2.883 GHz + 45,397,065,381 instructions # 2.67 insn per cycle + 5.908631173 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW 
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.365466e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.544754e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.544754e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.294098e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.465793e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.465793e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.232551 sec -INFO: No Floating Point Exceptions have been reported - 9,533,065,833 cycles # 2.943 GHz - 26,273,852,197 instructions # 2.76 insn per cycle - 3.239846074 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2386) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.291976 sec + 9,561,103,669 cycles # 2.900 GHz + 26,144,822,297 instructions # 2.73 insn per cycle + 3.297670541 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2347) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.514012e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.821697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.821697e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.426643e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.734905e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.734905e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.435584 sec -INFO: No Floating Point Exceptions have been reported - 6,758,526,375 cycles # 2.768 GHz - 14,047,168,742 instructions # 2.08 insn per cycle - 2.442338814 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2895) (512y: 0) (512z: 0) +TOTAL : 2.478214 sec + 6,700,126,016 cycles # 2.700 GHz + 13,943,282,534 instructions # 2.08 insn per cycle + 2.483989370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2871) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.791737e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.138604e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.138604e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.620283e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.949819e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.949819e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.301242 sec -INFO: No Floating Point Exceptions have been reported - 6,403,253,635 cycles # 2.776 GHz - 13,529,712,107 instructions # 2.11 insn per cycle - 2.307614270 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2531) (512y: 302) (512z: 0) +TOTAL : 2.378094 sec + 6,404,718,099 cycles # 2.688 GHz + 13,458,943,081 instructions # 2.10 insn per cycle + 2.383779382 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2508) (512y: 302) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.627313e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.823087e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.823087e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.539955e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.726603e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.726603e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.002431 sec -INFO: No Floating Point Exceptions have been reported - 5,614,669,392 cycles # 1.866 GHz - 9,218,497,811 instructions # 1.64 insn per cycle - 3.009264991 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1456) (512y: 212) (512z: 2059) +TOTAL : 3.070043 sec + 5,557,581,294 cycles # 1.808 GHz + 9,121,741,259 instructions # 1.64 insn per cycle + 3.075761617 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1425) (512y: 212) (512z: 2027) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 diff --git a/epochX/cudacpp/tput/teeThroughputX.sh b/epochX/cudacpp/tput/teeThroughputX.sh index 088371cb95..c4180b6725 100755 --- a/epochX/cudacpp/tput/teeThroughputX.sh +++ b/epochX/cudacpp/tput/teeThroughputX.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. scrdir=$(cd $(dirname $0); pwd) bckend=$(basename $(cd $scrdir; cd ..; pwd)) # cudacpp or alpaka @@ -10,7 +10,7 @@ cd $scrdir function usage() { - echo "Usage: $0 [-nocuda] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-makeonly] [-makeclean] [-makej] [-dlp ]" # -nofpe is no longer supported + echo "Usage: $0 [-nocuda] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-noBlas|-blasOn] [-makeonly] [-makeclean] [-makej] [-scaling] [-dlp ]" # -nofpe is no longer supported exit 1 } @@ -33,8 +33,10 @@ helinls="0" hrdcods="0" rndgen= rmbsmp= +blas="" # build with blas but disable it at runtime steps="make test" makej= +scaling= ###nofpe= dlp= dlpset=0 @@ -117,6 +119,12 @@ for arg in $*; do rmbsmp=$arg elif [ "$arg" == "-bridge" ]; then rmbsmp=$arg + elif [ "$arg" == "-noBlas" ]; then # build with blas but disable it at runtime + if [ "${blas}" == "-blasOn" ]; then echo "ERROR! Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$arg + elif [ "$arg" == "-blasOn" ]; then # build with blas and enable it at runtime + if [ "${blas}" == "-noBlas" ]; then echo "ERROR! 
Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$arg elif [ "$arg" == "-makeonly" ]; then if [ "${steps}" == "make test" ]; then steps="make" @@ -131,6 +139,8 @@ for arg in $*; do fi elif [ "$arg" == "-makej" ]; then makej=-makej + elif [ "$arg" == "-scaling" ]; then + scaling=$arg ###elif [ "$arg" == "-nofpe" ]; then ### nofpe=-nofpe else @@ -175,6 +185,8 @@ for step in $steps; do args="${args} ${alpaka}" # optionally disable alpaka tests args="${args} ${rndgen}" # optionally use common random numbers or curand on host args="${args} ${rmbsmp}" # optionally use rambo or bridge on host + args="${args} ${scaling}" # optionally run scaling tests + args="${args} ${blas}" # optionally build with no blas or instead enable it at runtime ###args="${args} ${nofpe}" # optionally disable FPEs args="${args} ${bldall}" # avx, fptype, helinl and hrdcod are now supported for all processes if [ "${step}" == "makeclean" ]; then @@ -191,6 +203,8 @@ for step in $steps; do logfile=logs_${proc#-}_${sufflog}/log_${proc#-}_${sufflog}_${fptype}_inl${helinl}_hrd${hrdcod}.txt if [ "${rndgen}" != "" ]; then logfile=${logfile%.txt}_${rndgen#-}.txt; fi if [ "${rmbsmp}" != "" ]; then logfile=${logfile%.txt}_${rmbsmp#-}.txt; fi + if [ "${blas}" != "" ]; then logfile=${logfile%.txt}_${blas#-}.txt; fi + if [ "${scaling}" != "" ]; then logfile=${logfile%.txt}.scaling; fi printf "\n%80s\n" |tr " " "*" printf "*** ./throughputX.sh $args | tee $logfile" printf "\n%80s\n" |tr " " "*" diff --git a/epochX/cudacpp/tput/throughputX.sh b/epochX/cudacpp/tput/throughputX.sh index 68df662f58..5d870a48ab 100755 --- a/epochX/cudacpp/tput/throughputX.sh +++ b/epochX/cudacpp/tput/throughputX.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. set +x # not verbose set -e # fail on error @@ -19,7 +19,7 @@ export MG5AMC_CHANNELID_DEBUG=1 function usage() { - echo "Usage: $0 [-bldall|-nocuda|-cpponly|-cudaonly|-hiponly|-noneonly|-sse4only|-avx2only|-512yonly|-512zonly] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-omp] [-makeonly|-makeclean|-makecleanonly|-dryrun] [-makej] [-3a3b] [-div] [-req] [-detailed] [-gtest(default)|-nogtest] [-v] [-dlp ]" # -nofpe is no longer supported + echo "Usage: $0 [-bldall|-nocuda|-cpponly|-cudaonly|-hiponly|-noneonly|-sse4only|-avx2only|-512yonly|-512zonly] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-noBlas|-blasOn] [-omp] [-makeonly|-makeclean|-makecleanonly|-dryrun] [-makej] [-3a3b] [-div] [-req] [-detailed] [-gtest(default)|-nogtest] [-scaling] [-v] [-dlp ]" # -nofpe is no longer supported exit 1 } @@ -49,7 +49,9 @@ fptypes="m" # new default #995 (was "d") helinls="0" hrdcods="0" rndgen="" -rmbsam="" +rmbsmp="" + +blas="" # build with blas but disable it at runtime maketype= makej= @@ -59,6 +61,7 @@ div=0 req=0 detailed=0 gtest= +scaling=0 ###nofpe=0 verbose=0 @@ -211,6 +214,14 @@ while [ "$1" != "" ]; do elif [ "$1" == "-bridge" ]; then rmbsmp=" -${1}" shift + elif [ "$1" == "-noBlas" ]; then # build without blas + if [ "${blas}" == "-blasOn" ]; then echo "ERROR! 
Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$1 + shift + elif [ "$1" == "-blasOn" ]; then # build with blas and enable it at runtime + if [ "${blas}" == "-noBlas" ]; then echo "ERROR! Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$1 + shift elif [ "$1" == "-makeonly" ] || [ "$1" == "-makeclean" ] || [ "$1" == "-makecleanonly" ] || [ "$1" == "-dryrun" ]; then if [ "${maketype}" != "" ] && [ "${maketype}" != "$1" ]; then echo "ERROR! Options -makeonly, -makeclean, -makecleanonly and -dryrun are incompatible"; usage @@ -245,6 +256,9 @@ while [ "$1" != "" ]; do fi gtest=0 shift + elif [ "$1" == "-scaling" ]; then + scaling=1 + shift ###elif [ "$1" == "-nofpe" ]; then ### nofpe=1 ### shift @@ -371,6 +385,9 @@ function showdir() echo $dir } +echo MADGRAPH_CUDA_ARCHITECTURE=${MADGRAPH_CUDA_ARCHITECTURE} +echo MADGRAPH_HIP_ARCHITECTURE=${MADGRAPH_HIP_ARCHITECTURE} + ###echo -e "\n********************************************************************************\n" printf "\n" @@ -434,6 +451,13 @@ done # PART 2 - build the executables which should be run ########################################################################## +if [ "${blas}" == "-noBlas" ]; then + export HASBLAS=hasNoBlas +else + export HASBLAS=hasBlas +fi +echo HASBLAS=${HASBLAS} + unset GTEST_ROOT unset LOCALGTEST @@ -497,6 +521,18 @@ if [ "${maketype}" != "-dryrun" ]; then printf "DATE: $(date '+%Y-%m-%d_%H:%M:%S')\n\n" fi +echo HASBLAS=${HASBLAS} + +if [ "${blas}" == "-blasOn" ]; then + export CUDACPP_RUNTIME_BLASCOLORSUM=1 +else + unset CUDACPP_RUNTIME_BLASCOLORSUM +fi +echo CUDACPP_RUNTIME_BLASCOLORSUM=${CUDACPP_RUNTIME_BLASCOLORSUM} + +unset CUDACPP_RUNTIME_CUBLASTF32TENSOR +echo CUDACPP_RUNTIME_CUBLASTF32TENSOR=${CUDACPP_RUNTIME_CUBLASTF32TENSOR} + function runExe() { exe1=$1 args="$2" @@ -507,6 +543,7 @@ function runExe() { # Optionally add other patterns here for some specific configurations (e.g. clang) if [ "${exe1%%/check_cuda*}" != "${exe1}" ] || [ "${exe1%%/check_hip*}" != "${exe1}" ]; then pattern="${pattern}|EvtsPerSec\[Matrix"; fi pattern="${pattern}|Workflow" + ###pattern="${pattern}|BLASCOLORSUM" ###pattern="${pattern}|CUCOMPLEX" ###pattern="${pattern}|COMMON RANDOM|CURAND HOST \(CUDA" pattern="${pattern}|ERROR" @@ -523,7 +560,7 @@ function runExe() { if [ "${detailed}" == "1" ]; then pattern="${pattern}|#"; fi if [ "${verbose}" == "1" ]; then set -x; fi ###perf stat -d $exe1 $args 2>&1 | grep -v "Performance counter stats" - perf stat -d $exe1 $args 2>&1 | egrep "(${pattern})" | grep -v "Performance counter stats" + perf stat -d $exe1 $args 2>&1 | egrep "(${pattern})" | grep -v "Performance counter stats" |& sed 's/.*rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/' set +x else # -- Older version using time @@ -539,6 +576,7 @@ function runTest() { echo "runTest $exe1" if [ "${maketype}" == "-dryrun" ]; then return; fi pattern="PASS|FAIL" + ###pattern="${pattern}|BLASCOLORSUM" pattern="${pattern}|ERROR" pattern="${pattern}|WARNING" pattern="${pattern}|Floating Point Exception" @@ -563,10 +601,12 @@ function cmpExe() { echo "ERROR! C++ calculation (C++${tag} failed"; exit 1 # expose FPE crash #1003 on HIP fi me1=$(cat ${tmp1} | grep MeanMatrix | awk '{print $4}'); cat ${tmp2} + ###cat ${tmp1} | grep BLASCOLORSUM if ! ${exef} ${argsf} 2>${tmp2} >${tmp1}; then echo "ERROR! 
Fortran calculation (F77${tag} failed"; exit 1 fi me2=$(cat ${tmp1} | grep Average | awk '{print $4}'); cat ${tmp2} + ###cat ${tmp1} | grep BLASCOLORSUM echo -e "Avg ME (C++${tag} = ${me1}\nAvg ME (F77${tag} = ${me2}" if [ "${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77${tag} returned NaN"; exit 1 @@ -588,16 +628,23 @@ function runNcu() { args="$2" args="$args$rndgen$rmbsmp" echo "runNcu $exe1 $args" - if [ "${verbose}" == "1" ]; then set -x; fi - #$(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin|registers| sm)' | tr "\n" " " | awk '{print $1, $2, $3, $15, $17; print $1, $2, $3, $18, $20$19}' - set +e # do not fail on error - out=$($(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args) - echo "$out" | egrep '(ERROR|WARNING)' # NB must escape $out in between quotes - set -e # fail on error (after ncu and after egrep!) - out=$(echo "${out}" | egrep '(sigmaKin|registers| sm)' | tr "\n" " ") # NB must escape $out in between quotes - echo $out | awk -v key1="launch__registers_per_thread" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)}; print $1, $2, $3, key1, val1}' - echo $out | awk -v key1="sm__sass_average_branch_targets_threads_uniform.pct" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)$(i+1)}; print $1, $2, $3, key1, val1}' - set +x + ###echoblas=1 + kernels="calculate_jamps color_sum_kernel" + ###if [ "${CUDACPP_RUNTIME_BLASCOLORSUM}" == "1" ]; then kernels="$kernels kernel"; fi # heavy to profile... + ###if [ "${CUDACPP_RUNTIME_BLASCOLORSUM}" == "1" ]; then kernels="$kernels regex:gemm"; fi # output to be improved... + for kernel in $kernels; do + if [ "${verbose}" == "1" ]; then set -x; fi + #$(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::${kernel}:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps|registers| sm)' | tr "\n" " " | awk '{print $1, $2, $3, $15, $17; print $1, $2, $3, $18, $20$19}' + set +e # do not fail on error + out=$($(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::${kernel}:" --kernel-name-base function $exe1 $args) + echo "$out" | egrep '(ERROR|WARNING)' # NB must escape $out in between quotes + ###if [ "${echoblas}" == "1" ]; then echo "$out" | egrep '(BLASCOLORSUM)'; echoblas=0; fi + set -e # fail on error (after ncu and after egrep!) 
+ out=$(echo "${out}" | egrep "(${kernel}|registers| sm)" | tr "\n" " ") # NB must escape $out in between quotes + echo $out | awk -v key1="launch__registers_per_thread" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)}; print $1, $2, $3, key1, val1}' + echo $out | awk -v key1="sm__sass_average_branch_targets_threads_uniform.pct" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)$(i+1)}; print $1, $2, $3, key1, val1}' + set +x + done } # Profile divergence metrics more in detail @@ -613,11 +660,11 @@ function runNcuDiv() { ###echo "runNcuDiv $exe1 $args" if [ "${verbose}" == "1" ]; then set -x; fi ###$(which ncu) --query-metrics $exe1 $args - ###$(which ncu) --metrics regex:.*branch_targets.* --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args - ###$(which ncu) --metrics regex:.*stalled_barrier.* --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args - ###$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", "", $20, $21; printf "%29s: %-51s %s\n", "", $24, $26}' - #$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %-10s %s\n", "", $18, $19, $22$21; printf "%29s: %-51s %-10s %s\n", "", $28, $29, $32$31; printf "%29s: %-51s %-10s %s\n", "", $23, $24, $27$26; printf "%29s: %-51s %s\n", "", $33, $35}' - out=$($(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin| sm)' | tr "\n" " ") + ###$(which ncu) --metrics regex:.*branch_targets.* --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args + ###$(which ncu) --metrics regex:.*stalled_barrier.* --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args + ###$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", 
"", $20, $21; printf "%29s: %-51s %s\n", "", $24, $26}' + #$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %-10s %s\n", "", $18, $19, $22$21; printf "%29s: %-51s %-10s %s\n", "", $28, $29, $32$31; printf "%29s: %-51s %-10s %s\n", "", $23, $24, $27$26; printf "%29s: %-51s %s\n", "", $33, $35}' + out=$($(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps| sm)' | tr "\n" " ") ###echo $out echo $out | awk -v key1="smsp__sass_branch_targets.sum" '{key2=key1".per_second"; val1="N/A"; val2=""; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+1); if ($i==key2 && $(i+1)!="(!)") val2=$(i+2)$(i+1)}; printf "%29s: %-51s %-10s %s\n", "", key1, val1, val2}' echo $out | awk -v key1="smsp__sass_branch_targets_threads_uniform.sum" '{key2=key1".per_second"; val1="N/A"; val2=""; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+1); if ($i==key2 && $(i+1)!="(!)") val2=$(i+2)$(i+1)}; printf "%29s: %-51s %-10s %s\n", "", key1, val1, val2}' @@ -637,7 +684,7 @@ function runNcuReq() { for args in "-p 1 1 1" "-p 1 4 1" "-p 1 8 1" "-p 1 32 1" "$ncuArgs"; do ###echo "runNcuReq $exe1 $args" # NB This will print nothing if $args are invalid (eg "-p 1 4 1" when neppR=8) - $(which ncu) --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin|registers| sm|l1tex)' | tr "\n" " " | awk -vtag="[$args]" '{print $1, $2, $3, $16"s", $17";", $19"s", $20, tag}' + $(which ncu) --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps|registers| sm|l1tex)' | tr "\n" " " | awk -vtag="[$args]" '{print $1, $2, $3, $16"s", $17";", $19"s", $20, tag}' done set +x } @@ -659,10 +706,19 @@ else fi echo -e "On $HOSTNAME [CPU: $cpuTxt] [GPU: $gpuTxt]:" +# Configure scaling tests +if [ "${scaling}" == "0" ]; then # no scaling tests (throughput tests only) + exesSc= +elif [ "${scaling}" == "1" ]; then # scaling tests only (skip throughput tests) + exesSc=$exes + exes= +fi + # These two settings are needed by BMK containers: do not change them BMKEXEARGS="" # if BMKEXEARGS is set, exeArgs is set equal to BMKEXEARGS, while exeArgs2 is set to "" BMKMULTIPLIER=1 # the pre-defined numbers of iterations (including 
those in BMKEXEARGS) are multiplied by BMKMULTIPLIER +# (1) TRADITIONAL THROUGHPUT TESTS ###lastExe= lastExeDir= ###echo "exes=$exes" @@ -726,7 +782,7 @@ for exe in $exes; do exeArgs="-p 64 256 1" ncuArgs="-p 64 256 1" # For ggttgg (NEW): on GPU test both "64 256" and "2048 256" for ggttgg as the latter gives ~10% higher throughput on cuda110/gcc92 - exeArgs2="-p 2048 256 1" + ###exeArgs2="-p 2048 256 1" # Sep 2025: this aborts (and is not needed as the plateau is reached earlier) with helicity streams elif [ "${exe%%/gg_ttg*}" != "${exe}" ]; then # For ggttg, as on ggttgg: this is a good GPU middle point: tput is 1.5x lower with "32 256 1", only a few% higher with "128 256 1" ###exeArgs="-p 64 256 1" # too short! see https://its.cern.ch/jira/browse/BMK-1056 @@ -760,9 +816,16 @@ for exe in $exes; do unset OMP_NUM_THREADS fi elif [[ "${exe%%/check_cuda*}" != "${exe}" || "${exe%%/check_hip*}" != "${exe}" ]] || [ "${exe%%/alpcheck*}" != "${exe}" ]; then + echo "........................................................................." runNcu $exe "$ncuArgs" - if [ "${div}" == "1" ]; then runNcuDiv $exe; fi - if [ "${req}" == "1" ]; then runNcuReq $exe "$ncuArgs"; fi + if [ "${div}" == "1" ]; then + echo "........................................................................." + runNcuDiv $exe + fi + if [ "${req}" == "1" ]; then + echo "........................................................................." + runNcuReq $exe "$ncuArgs" + fi if [ "${exeArgs2}" != "" ]; then echo "........................................................................."; runExe $exe "$exeArgs2"; fi fi if [ "${gtest}" == "1" ]; then @@ -777,6 +840,51 @@ for exe in $exes; do cmpExe $exe fi done +###echo "=========================================================================" + +# (2) SCALING TESTS +lastExeDir= +for exe in $exesSc; do + if [ "$(basename $(dirname $exe))" != "$lastExeDir" ]; then + echo "=========================================================================" + lastExeDir=$(basename $(dirname $exe)) + else + echo "-------------------------------------------------------------------------" + fi + echo "scalingTest $exe" + if [ ! -f $exe ]; then echo "Not found: $exe"; continue; fi + if [ "${unamep}" != "x86_64" ]; then + if [ "${exe/build.avx2}" != "${exe}" ]; then echo "$exe is not supported on ${unamep}"; continue; fi + if [ "${exe/build.512y}" != "${exe}" ]; then echo "$exe is not supported on ${unamep}"; continue; fi + if [ "${exe/build.512z}" != "${exe}" ]; then echo "$exe is not supported on ${unamep}"; continue; fi + elif [ "${unames}" == "Darwin" ]; then + if [ "${exe/build.512y}" != "${exe}" ]; then echo "$exe is not supported on ${unames}"; continue; fi + if [ "${exe/build.512z}" != "${exe}" ]; then echo "$exe is not supported on ${unames}"; continue; fi + elif [ "$(grep -m1 -c avx512vl /proc/cpuinfo)" != "1" ]; then + if [ "${exe/build.512y}" != "${exe}" ]; then echo "$exe is not supported (no avx512vl in /proc/cpuinfo)"; continue; fi + if [ "${exe/build.512z}" != "${exe}" ]; then echo "$exe is not supported (no avx512vl in /proc/cpuinfo)"; continue; fi + fi + exeDir=$(dirname $exe) + cd $exeDir/.. 
# workaround for reading '../../Cards/param_card.dat' without setting MG5AMC_CARD_PATH + unset OMP_NUM_THREADS + # Scaling test with 256 threads per block + if [[ "${exe%%/check_cuda*}" != "${exe}" || "${exe%%/check_hip*}" != "${exe}" ]]; then + echo "### GPU: scaling test 256" + for b in 1 2 4 8 16 32 64 128 256 512 1024; do ( $exe -p $b 256 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 256}" ) |& sed "s/Gpu.*Assert/Assert/" |& sed 's/.*rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/'; done + if [[ "${exe%%/check_hip*}" != "${exe}" ]]; then + echo "### GPU: scaling test 64" + for b in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096; do ( $exe -p $b 64 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 64}" ) |& sed 's/.*rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/'; done # HIP (AMD GPU warp size is 32) + else + echo "### GPU: scaling test 32" + for b in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192; do ( $exe -p $b 32 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 32}" ) |& sed "s/Gpu.*Assert/Assert/"; done # CUDA (NVidia GPU warp size is 32) + fi + else + echo "### CPU: scaling test 256" + for b in 1 2 4; do ( $exe -p $b 256 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 256}" ); done + echo "### CPU: scaling test 32" + for b in 1 2 4; do ( $exe -p $b 32 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 32}" ); done + fi +done echo "=========================================================================" # Workaround for reading of data files
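Note: a minimal sketch (not part of the patch above) of reproducing the -blasOn runtime configuration by hand, using only the environment variables that throughputX.sh exports; the relative build directory and the "-p 2048 256 2" arguments are copied from the runExe lines in the logs and are assumptions about the local SubProcesses/P1_gg_ttx layout.
  export HASBLAS=hasBlas                   # build-time setting exported before make; -noBlas switches this to hasNoBlas
  export CUDACPP_RUNTIME_BLASCOLORSUM=1    # runtime setting exported by -blasOn; unset it to reproduce the default runs
  unset CUDACPP_RUNTIME_CUBLASTF32TENSOR   # left unset by throughputX.sh in these logs (TF32 tensor mode off)
  ./build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2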

[Perl diagram-page generator, identical hunk in each generated copy: the per-page diagram images, the warning comment, and the file-existence check switch from JPEG (.jpg) to PNG; the HTML markup inside the print statements did not survive, so the -/+ print lines below appear identical.]

 \ Postscript Diagrams for $proc\<\/A\> \ \n";
 	for($j=1;$j<$pages;$j++){
-	    print PAGE "\\"Page \ \n";
+	    print PAGE "\\"Page \ \n";
 	}#end of for
 #
-# In case I didn't include all of the diagrams as jpeg, warn user
+# In case I didn't include all of the diagrams as PNG, warn user
 #
-	if (-e "matrix$imatrix$max_jpg.jpg" ) {
-	    print PAGE "
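
The file-existence check at the end of the hunk is what triggers the "not all diagrams included" warning. A rough shell equivalent of that logic, with hypothetical values standing in for the Perl variables $imatrix and $max_jpg, might look like this:

#!/bin/bash
# Rough sketch of the generator's warning logic (imatrix and max_jpg are
# hypothetical placeholders; the real values come from the Perl script).
imatrix=1
max_jpg=20
if [ -e "matrix${imatrix}${max_jpg}.png" ]; then
  echo "Warning: not all diagrams were included as PNG pages"
fi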